Christopher Boomhower1, Stacey Fabricant2, Alex Frye1, David Mumford2, Michael Smith1, Lindsay Vitovsky1
1 Southern Methodist University, Dallas, TX, US; 2 Penn Mutual Life Insurance Co., Horsham, PA
To begin our analysis, we need to load the data from our 89 source .txt files. The data is separated into two groups of files, Separation and Non-Separation, so it is loaded in two separate phases and then unioned together. Once the data is loaded, steps are taken to remove non-US observations and those with no specified occupation, no specified salary, or no specified length-of-service level. Of a total of 8,423,336 observations, we end with 8,232,375 after these removals.
## Import libraries
import pickle
import os
import psutil
import glob
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import requests
import json
import missingno as msno
import prettytable
import math
from sklearn.preprocessing import MinMaxScaler, StandardScaler, label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import class_weight
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# sklearn.cross_validation was removed in scikit-learn 0.20; StratifiedKFold,
# cross_val_score and ShuffleSplit all live in sklearn.model_selection.
from sklearn.model_selection import StratifiedKFold, cross_val_score, ShuffleSplit
# Consolidated metrics imports (roc_auc_score was previously imported twice).
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, log_loss
# scipy.interp was simply an alias of numpy.interp and was removed in SciPy 1.6.
from numpy import interp
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from dateutil.parser import parse
from itertools import cycle
from sklearn import metrics as mt
from sklearn.feature_selection import chi2
import itertools
#Need to make sure you install the rpy2 package via following command in the Putty genuse41 console:
#python3 /usr/bin/pip install --user rpy2
#NOTE: If the above pip install does not work, try the following instead:
#python3 /usr/local/es7/lib/python3.5/site-packages/pip install --user rpy2
%load_ext rpy2.ipython
## Library Options
# Silence SettingWithCopyWarning and widen the notebook display limits so wide
# merged frames render fully.
pd.options.mode.chained_assignment = None
for optName, optValue in (('display.max_rows', 500),
                          ('display.max_columns', 500),
                          ('display.width', 1000)):
    pd.set_option(optName, optValue)
## Pre-defined Functions for use later
def pickleObject(objectname, filename, filepath = "PickleJar/"):
    """Serialize *objectname* to '<filepath><filename>.pkl' with pickle.

    objectname: any picklable object; filename: base name without extension;
    filepath: directory prefix (must already exist, trailing slash expected).
    """
    fullpicklepath = "{0}{1}.pkl".format(filepath, filename)
    # Context manager guarantees the handle is closed even if dump() raises.
    with open(fullpicklepath, 'wb') as picklefile:
        pickle.dump(objectname, picklefile)
def unpickleObject(filename, filepath = "PickleJar/"):
    """Load and return the object pickled at '<filepath><filename>.pkl'.

    filename: base name without extension; filepath: directory prefix.
    """
    fullunpicklepath = "{0}{1}.pkl".format(filepath, filename)
    # 'with' closes the handle; a distinct local name avoids shadowing this
    # function (the original assigned to a local named 'unpickleObject').
    with open(fullunpicklepath, 'rb') as unpicklefile:
        loadedObject = pickle.load(unpicklefile)
    return loadedObject
def clear_display():
    """Clear the output area of the current notebook cell.

    The original imported clear_output but never invoked it, so the helper
    silently did nothing.
    """
    from IPython.display import clear_output
    clear_output()
## Pre-defined variables for use later
# Relative directory holding the OPM separation .txt files.
dataOPMPath = "dataOPM"
# Relative directory holding the quarterly employment (EMP) .txt files.
dataEMPPath = "dataEMP"
# Cache directory used by pickleObject/unpickleObject.
PickleJarPath = "PickleJar"
%%time
## Load OPMSeparation Files
OPMDataFiles = glob.glob(os.path.join(dataOPMPath, "*.txt"))
# Normalize Windows path separators so the string comparisons below match.
OPMDataFiles = [f.replace("\\", "/") for f in OPMDataFiles]
# Read every file as strings; numeric columns are converted after the merges.
OPMDataList = []
for dataPath in OPMDataFiles:
    OPMDataList.append(pd.read_csv(dataPath, dtype='str'))
    display(OPMDataList[-1].head())
## Load the SEPDATA_FY2015 file into its own object
indexes = [i for i, x in enumerate(OPMDataFiles) if x == 'dataOPM/SEPDATA_FY2015.txt']
OPMDataOrig = OPMDataList[indexes[0]]
%%time
#print(OPMDataFiles)
# Row count of the raw FY2015 separation file.
sepRowCount = len(OPMDataOrig)
print(sepRowCount)
##### Merge / Modify Codes / Aggregate Attributes to be more descriptive per the metadata files
OPMDataMerged = OPMDataOrig.copy()
# Each DTxxx.txt translation table decodes one coded column; left-merging each
# appends its human-readable descriptor columns. Data-driven loop replaces
# eleven copy-pasted merge stanzas (merge order preserved).
lookupSpecs = [('dataOPM/DTagy.txt',    'AGYSUB'),   ##AGYSUB - AGYTYP, AGY
               ('dataOPM/DTefdate.txt', 'EFDATE'),   ##EFDate - quarter, month
               ('dataOPM/DTagelvl.txt', 'AGELVL'),   ##AGELVL - AGELVLT
               ('dataOPM/DTloslvl.txt', 'LOSLVL'),   ##LOSLVL - LOSLVLT
               ('dataOPM/DTloc.txt',    'LOC'),      ##LOC - LocTypeT, LocT
               ('dataOPM/DTocc.txt',    'OCC'),      ##OCC - OCCTYPT, OCCFAM
               ('dataOPM/DTpatco.txt',  'PATCO'),    ##PATCO - PATCOT
               ('dataOPM/DTppgrd.txt',  'PPGRD'),    ##PPGRD - PayPlan, PPGroup, PPTYP
               ('dataOPM/DTsallvl.txt', 'SALLVL'),   ##SALLVL - SALLVLT
               ('dataOPM/DTtoa.txt',    'TOA'),      ##TOA - TOATYP
               ('dataOPM/DTwrksch.txt', 'WORKSCH')]  ##WORKSCH - WSTYPT
for lookupFile, joinKey in lookupSpecs:
    indexes = [i for i, x in enumerate(OPMDataFiles) if x == lookupFile]
    OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on=joinKey, how='left')
## Modify Data Types for numeric objects
for numCol in ("SALARY", "COUNT", "LOS"):
    OPMDataMerged[numCol] = pd.to_numeric(OPMDataMerged[numCol])
print("Original SEP data size of: "+str(len(OPMDataMerged)))
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["LOCTYP"] != "1"]))+" Non-US observations.")
## Remove Non-US Data
OPMDataMerged = OPMDataMerged[OPMDataMerged["LOCTYP"] == "1"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["OCCTYP"] == "3"]))+" observations with no specified Occupation.")
## Remove Observations with no specified occupation
OPMDataMerged = OPMDataMerged[OPMDataMerged["OCCTYP"] != "3"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["SALLVL"] == "Z"]))+" observations with no specified Salary.")
## Remove Observations with no specified salary
OPMDataMerged = OPMDataMerged[OPMDataMerged["SALLVL"] != "Z"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["LOSLVL"] == "Z"]))+" observations with no specified Length of Service.")
## Remove Observations with no specified LOSLVL
OPMDataMerged = OPMDataMerged[OPMDataMerged["LOSLVL"] != "Z"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["AGELVL"] == "A"]))+" observations of Age Level A")
## Remove Observations from Age Level A (less than 20 years old)
OPMDataMerged = OPMDataMerged[OPMDataMerged["AGELVL"] != "A"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["AGELVL"] == "Z"]))+" observations with no specified Age Level.")
## Remove Observations with no specified Age Level
OPMDataMerged = OPMDataMerged[OPMDataMerged["AGELVL"] != "Z"]
## Fix differences in spaces on WORKSCHT Column
# Recode by first letter; anything unmapped (including NaN) falls through to
# the ELSE label, matching the original nested np.where ladder.
workSchedByInitial = {"F": 'Full-time Nonseasonal',
                      "I": 'Intermittent Nonseasonal',
                      "P": 'Part-time Nonseasonal',
                      "G": 'Full-time Seasonal',
                      "J": 'Intermittent Seasonal',
                      "Q": 'Part-time Seasonal',
                      "T": 'Part-time Job Sharer Seasonal',
                      "S": 'Part-time Job Sharer Nonseasonal',
                      "B": 'Full-time Nonseasonal Baylor Plan'}
OPMDataMerged["WORKSCHT"] = (OPMDataMerged["WORKSCHT"].str[0]
                             .map(workSchedByInitial)
                             .fillna('NO WORK SCHEDULE REPORTED'))  ### ELSE case represents Night
display(OPMDataMerged.head())
print("New SEP data size of: "+str(len(OPMDataMerged)))
display(OPMDataMerged.describe().transpose())
#del OPMDataList,OPMDataFiles
#del OPMDataList,OPMDataFiles
%%time
if os.path.isfile(PickleJarPath+"/EMPDataOrig4Q.pkl"):
    print("Found the File! Loading Pickle Now!")
    EMPDataOrig4Q = unpickleObject("EMPDataOrig4Q")
else:
    ## Load EMPData Files
    indexes = []
    EMPDataFiles = []
    EMPDataList = []
    EMPDataOrig = []
    for i, qtr in enumerate(["Q1", "Q2", "Q3", "Q4"]):
        EMPDataFiles.append(glob.glob(os.path.join(dataEMPPath, qtr + "/*.txt")))
        # Normalize Windows path separators so the path comparisons below match.
        EMPDataFiles[i] = [f.replace("\\", "/") for f in EMPDataFiles[i]]
        EMPDataList.append([])
        for j, file in enumerate(EMPDataFiles[i]):
            EMPDataList[i].append(pd.read_csv(file, dtype='str'))
            if i == 0:
                display(EMPDataList[i][j].head())
        ## Load the FACTDATA files into their own object
        ##[qtr][fileindex from EMPDataList]
        indexes.append([j for j, x in enumerate(EMPDataFiles[i])
                        if dataEMPPath + '/' + qtr + '/FACTDATA' in x])
        EMPDataOrig.append(pd.concat([EMPDataList[i][k] for k in indexes[i]]))
        EMPDataOrig[i]["QTR"] = str(i+1)
        ## modify data type for numerics
        # BUG FIX: regex=False makes '$' a literal character. With the old
        # default (regex=True) the bare '$' was an end-of-string anchor, so the
        # replace was a no-op and dollar signs survived into to_numeric.
        EMPDataOrig[i]["SALARY"] = pd.to_numeric(EMPDataOrig[i]["SALARY"]
                                                 .str.replace(',', '', regex=False)
                                                 .str.replace('$', '', regex=False)
                                                 .str.replace(' ', '', regex=False))
        ## Load Metadata: left-merge each translation table (order preserved
        ## from the original eleven copy-pasted stanzas).
        lookupSpecs = [('DTagy.txt',    'AGYSUB'),   ##AGYSUB - AGYTYP, AGY
                       ('DTagelvl.txt', 'AGELVL'),   ##AGELVL - AGELVLT
                       ('DTloslvl.txt', 'LOSLVL'),   ##LOSLVL - LOSLVLT
                       ('DTloc.txt',    'LOC'),      ##LOC - LocTypeT, LocT
                       ('DTocc.txt',    'OCC'),      ##OCC - OCCTYPT, OCCFAM
                       ('DTpatco.txt',  'PATCO'),    ##PATCO - PATCOT
                       ('DTppgrd.txt',  'PPGRD'),    ##PPGRD - PayPlan, PPGroup, PPTYP
                       ('DTsallvl.txt', 'SALLVL'),   ##SALLVL - SALLVLT
                       ('DTtoa.txt',    'TOA'),      ##TOA - TOATYP
                       ('DTwrksch.txt', 'WORKSCH')]  ##WORKSCH - WSTYPT
        for lookupFile, joinKey in lookupSpecs:
            ind2 = [k for k, x in enumerate(EMPDataFiles[i])
                    if x == dataEMPPath + '/' + qtr + '/' + lookupFile]
            EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on=joinKey, how='left')
        EMPDataOrig[i]["LOS"] = pd.to_numeric(EMPDataOrig[i]["LOS"])
        display(EMPDataOrig[i].head())
    EMPDataOrig4Q = pd.concat([EMPDataOrig[j] for j in range(0, len(EMPDataOrig))])
    print("Original EMP data size of: "+str(len(EMPDataOrig4Q)))
    print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["LOCTYP"] != "1"]))+" Non-US observations.")
    ## Remove Non-US Data
    EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["LOCTYP"] == "1"]
    print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["OCCTYP"] == "3"]))+" observations with no specified Occupation.")
    ## Remove Observations with no specified occupation
    EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["OCCTYP"] != "3"]
    print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["SALLVL"] == "Z"]))+" observations with no specified Salary.")
    ## Remove Observations with no specified salary
    EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["SALLVL"] != "Z"]
    print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["LOSLVL"] == "Z"]))+" observations with no specified Length of Service.")
    ## Remove Observations with no specified LOSLVL
    EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["LOSLVL"] != "Z"]
    print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] == "A"]))+" observations of Age Level A.")
    ## Remove Observations from Age Level A (less than 20 years old)
    EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] != "A"]
    print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] == "Z"]))+" observations with no specified Age Level.")
    ## Remove Observations with no specified Age Level
    EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] != "Z"]
    ## Fix differences in spaces on WORKSCHT Column
    # Recode by first letter; unmapped values (incl. NaN) get the ELSE label,
    # matching the original nested np.where ladder.
    workSchedByInitial = {"F": 'Full-time Nonseasonal',
                          "I": 'Intermittent Nonseasonal',
                          "P": 'Part-time Nonseasonal',
                          "G": 'Full-time Seasonal',
                          "J": 'Intermittent Seasonal',
                          "Q": 'Part-time Seasonal',
                          "T": 'Part-time Job Sharer Seasonal',
                          "S": 'Part-time Job Sharer Nonseasonal',
                          "B": 'Full-time Nonseasonal Baylor Plan'}
    EMPDataOrig4Q["WORKSCHT"] = (EMPDataOrig4Q["WORKSCHT"].str[0]
                                 .map(workSchedByInitial)
                                 .fillna('NO WORK SCHEDULE REPORTED'))  ### ELSE case represents Night
    # Cache the cleaned frame so subsequent runs take the fast path above.
    pickleObject(EMPDataOrig4Q, "EMPDataOrig4Q")
print("New EMP data size of: "+str(len(EMPDataOrig4Q)))
display(EMPDataOrig4Q.describe().transpose())
%matplotlib inline
#sns.boxplot(y = "SALARY", data = EMPDataOrig4Q)
With both our separation and non-separation data loaded, we calculate three new attributes through aggregation or calculation amongst various attributes.
1) SEP Count by Date & Occupation – total number of separations (of any type) for a given Date and Occupation;
2) SEP Count by Date & Location – total number of separations (of any type) for a given Date and Location;
3) Industry Average Salary – Average salary amongst non-separated employees, grouped by quarter, occupation, pay grade, and work schedule;
We proceed by concatenating our Separation and Non-Separation observations and merging these newly calculated attributes into the concatenated dataset.
%%time
%matplotlib inline
##Aggregate Number of Total Separations in current month for given Occ
AggSEPCount_EFDATE_OCC = OPMDataMerged.groupby(["EFDATE", "OCC"]).size().reset_index(name='SEPCount_EFDATE_OCC')
display(AggSEPCount_EFDATE_OCC.head())
##Aggregate Number of Total Separations in current month for given LOC
AggSEPCount_EFDATE_LOC = OPMDataMerged.groupby(["EFDATE", "LOC"]).size().reset_index(name='SEPCount_EFDATE_LOC')
display(AggSEPCount_EFDATE_LOC.head())
##Average Quarterly EMP Salary by occ
# Average = group row count / group salary sum (kept as size/sum rather than
# .mean() to preserve the original's treatment of any missing salaries).
grpKeys = ["QTR", "OCC", "PPGRD", "WORKSCHT"]
groupSizes = EMPDataOrig4Q.groupby(grpKeys).size().reset_index(name='count')
groupSums = EMPDataOrig4Q.groupby(grpKeys)["SALARY"].sum().reset_index(name='IndSalarySum')
AggIndAvgSalary = groupSizes.merge(groupSums, on=grpKeys)
AggIndAvgSalary["IndAvgSalary"] = AggIndAvgSalary["IndSalarySum"] / AggIndAvgSalary["count"]
AggIndAvgSalary = AggIndAvgSalary.drop(['count', 'IndSalarySum'], axis=1)
display(AggIndAvgSalary.head())
#Merge Two Datasets
### NS SEP code means NonSeparation
###add hardcoded null value columns where applicable
EMPDataOrig4Q["SEP"] = "NS"
EMPDataOrig4Q["GENDER"] = np.nan
EMPDataOrig4Q["COUNT"] = np.nan
OPMDataMerged["DATECODE"] = OPMDataMerged["EFDATE"]
# The two frames share one column layout; define it once (the original kept
# two identical hand-maintained copies). Both names preserved for later cells.
OPMColList = ["AGYSUB", "SEP", "DATECODE", "AGELVL", "GENDER", "GSEGRD", "LOSLVL", "LOC", "OCC", "PATCO", "PPGRD", "SALLVL", "TOA", "WORKSCH", "COUNT", "SALARY", "LOS", "AGYTYP", "AGYTYPT", "AGY", "AGYT", "AGYSUBT", "QTR", "AGELVLT", "LOSLVLT", "LOCTYP", "LOCTYPT", "LOCT", "OCCTYP", "OCCTYPT", "OCCFAM", "OCCFAMT", "OCCT", "PATCOT", "PPTYP", "PPTYPT", "PPGROUP", "PPGROUPT", "PAYPLAN", "PAYPLANT", "SALLVLT", "TOATYP", "TOATYPT", "TOAT", "WSTYP", "WSTYPT", "WORKSCHT"]
EMPColList = list(OPMColList)
OPMDataMerged = pd.concat([OPMDataMerged[OPMColList], EMPDataOrig4Q[EMPColList]], ignore_index=True)
print("Total concatenated data size for SEP and non-SEP: "+str(len(OPMDataMerged)))
OPMDataMerged = OPMDataMerged.merge(AggSEPCount_EFDATE_OCC, left_on=['DATECODE','OCC'], right_on=['EFDATE','OCC'], how='left')
OPMDataMerged = OPMDataMerged.merge(AggSEPCount_EFDATE_LOC, left_on=['DATECODE','LOC'], right_on=['EFDATE','LOC'], how='left')
OPMDataMerged = OPMDataMerged.merge(AggIndAvgSalary, on=['QTR','OCC', 'PPGRD', 'WORKSCHT'], how='left')
# Positive = paid above the industry (non-separated) average for the group.
OPMDataMerged["SalaryOverUnderIndAvg"] = OPMDataMerged["SALARY"] - OPMDataMerged["IndAvgSalary"]
# Drop the duplicated join keys introduced by the two EFDATE merges.
OPMDataMerged = OPMDataMerged.drop(["EFDATE_x", "EFDATE_y"], axis=1)
display(OPMDataMerged.head())
display(OPMDataMerged.tail())
print(len(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].isnull()]))
display(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].isnull()][["SEP","DATECODE", "OCC"]].drop_duplicates())
These 50993 Non-Separation observations have no coverage within the Separation dataset, so we remove them as an out-of-scope demographic in our analysis. Any attempt at predicting these values would not have enough data to support a significant response.
## Keep only rows that matched a separation count for their date/occupation.
hasOccCoverage = OPMDataMerged["SEPCount_EFDATE_OCC"].notnull()
OPMDataMerged = OPMDataMerged[hasOccCoverage]
print(len(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].isnull()]))
print(len(OPMDataMerged))
print(len(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_LOC"].isnull()]))
display(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_LOC"].isnull()][["SEP","DATECODE","LOC"]].drop_duplicates())
print(len(OPMDataMerged[OPMDataMerged["IndAvgSalary"].isnull()]))
display(OPMDataMerged[OPMDataMerged["IndAvgSalary"].isnull()][["QTR", "SEP","OCCT", "PPGRD", "WORKSCHT"]].drop_duplicates())
These 1293 separation observations have no coverage within the EMP dataset, so we remove them as an out-of-scope demographic in our analysis. Any attempt at predicting these values would not have enough data to support a significant response.
## Keep only rows with an industry-average salary match from the EMP data.
hasIndAvg = OPMDataMerged["IndAvgSalary"].notnull()
OPMDataMerged = OPMDataMerged[hasIndAvg]
print(len(OPMDataMerged[OPMDataMerged["IndAvgSalary"].isnull()]))
print(len(OPMDataMerged))
# Placeholder Chunks for Data Quality check of salary against GS Grade Level Ranges
We are interested to see how federal pension plans may impact attrition in this dataset. An interesting attribute to complement Length of Service is Years to Retirement. Utilizing a FERS retirement-eligibility baseline of 57 years of age for all observations, together with the lower limit of each age-level range, we compute a numeric value for years to retirement.
#Add Column YearsToRetirement
"""
AGELVL,AGELVLT
A,Less than 20
B,20-24
C,25-29
D,30-34
E,35-39
F,40-44
G,45-49
H,50-54
I,55-59
J,60-64
K,65 or more
Z,Unspecified
"""
# Lower bound of each OPM age-band code (table above). Unmapped codes become
# NaN; levels A and Z were already filtered out earlier. astype(float) keeps
# the float64 dtype the original np.where ladder produced.
ageBandLowerLimit = {"B": 20, "C": 25, "D": 30, "E": 35, "F": 40,
                     "G": 45, "H": 50, "I": 55, "J": 60, "K": 65}
OPMDataMerged["LowerLimitAge"] = OPMDataMerged["AGELVL"].map(ageBandLowerLimit).astype(float)
retAge = 57  # FERS retirement-eligibility baseline age
# Identical to the original per-level (retAge - lower limit) ladder, without
# duplicating the ten-branch mapping; NaN propagates for unmapped levels.
OPMDataMerged["YearsToRetirement"] = retAge - OPMDataMerged["LowerLimitAge"]
print("Null Values for LowerLimitAge: " + str(len(OPMDataMerged[OPMDataMerged["LowerLimitAge"].isnull()])))
print("Null Values for YearsToRetirement: " + str(len(OPMDataMerged[OPMDataMerged["YearsToRetirement"].isnull()])))
display(OPMDataMerged.head())
display(OPMDataMerged.tail())
In addition to the OPM data, we merge 10 attributes from the Bureau of Labor Statistics (BLS). Data is sourced from Federal Government industry codes across all regions. Although assumed to be highly correlated, we source both Level (Total number) and Rate (Percentage of Level to total employment and / or job openings) for the following statistics: 1) Job Openings, 2) Layoffs, 3) Quits, 4) Total Separations, and 5) Other Separations. While Rate paints an aggregated, holistic picture for job market trends, Level provides a raw count for total separations alone. Both these statistics were captured by a monthly aggregate and merged to the OPM data by their respective months.
%%time
def bls(series, start, end):
    """Fetch monthly observations for BLS time series via the v2 public API.

    series: list of [seriesID, friendlyName] pairs (only seriesID is used here);
    start/end: year strings, e.g. "2014".
    Returns a list of DataFrames, one per returned series, with columns
    ["series id", "year", "period", "value", "footnotes"]; only monthly
    periods M01-M12 are kept (M13 annual averages are skipped).
    """
    headers = {'Content-type': 'application/json'}
    sID = [entry[0] for entry in series]
    # NOTE(review): the registration key is hard-coded; consider moving it to
    # an environment variable or config file.
    data = json.dumps({"seriesid": sID,
                       "startyear": start,
                       "endyear": end,
                       "catalog": False,
                       "calculations": False,
                       "annualaverage": False,
                       "registrationkey": "7a89c8d7979349fba8914b8be16a1646"})
    p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers)
    json_data = json.loads(p.text)
    # Renamed from 'bls'/'series' — the original shadowed both the function
    # name and its first parameter inside the loop.
    frames = []
    for seriesData in json_data['Results']['series']:
        seriesId = seriesData['seriesID']
        rows = []
        for item in seriesData['data']:
            period = item['period']
            if 'M01' <= period <= 'M12':
                footnotes = ""
                for footnote in item['footnotes']:
                    if footnote:
                        footnotes = footnotes + footnote['text'] + ','
                rows.append({"series id": seriesId,
                             "year": item['year'],
                             "period": period,
                             "value": item['value'],
                             "footnotes": footnotes})
        # Building from a list of dicts replaces per-row DataFrame.append,
        # which was removed in pandas 2.x (and was O(n^2) anyway).
        frames.append(pd.DataFrame(rows, columns=["series id", "year", "period", "value", "footnotes"]))
    return frames
%%time
seriesList = [
    ['JTU91000000JOL','BLS_FEDERAL_JobOpenings_Level'],
    ['JTU91000000LDL','BLS_FEDERAL_Layoffs_Level'],
    ['JTU91000000OSL','BLS_FEDERAL_OtherSep_Level'],
    ['JTU91000000QUL','BLS_FEDERAL_Quits_Level'],
    ['JTU91000000TSL','BLS_FEDERAL_TotalSep_Level'],
    ['JTU91000000JOR','BLS_FEDERAL_JobOpenings_Rate'],
    ['JTU91000000LDR','BLS_FEDERAL_Layoffs_Rate'],
    ['JTU91000000OSR','BLS_FEDERAL_OtherSep_Rate'],
    ['JTU91000000QUR','BLS_FEDERAL_Quits_Rate'],
    ['JTU91000000TSR','BLS_FEDERAL_TotalSep_Rate']
]
# Pull job openings and labor turnover data
JTL = bls(seriesList, "2014", "2015")
seriesList = pd.DataFrame(seriesList, columns=["series id","sName"])
##We need to replace these with actual Descriptor Column Names
for i in range(0, len(seriesList)):
    JTL[i] = JTL[i].merge(seriesList, on="series id", how='inner')
    # Use the descriptive name when rows came back; fall back to the index for
    # an empty result so the rename below still works.
    if len(JTL[i]) > 0:
        name = JTL[i]["sName"].drop_duplicates().values[0]
    else:
        name = str(i)
    JTL[i][name] = pd.to_numeric(JTL[i]["value"])
    # DATECODE = YYYYMM, matching the OPM effective-date key.
    JTL[i]["DATECODE"] = JTL[i]["year"] + JTL[i]["period"].str[-2:]
    JTL[i] = JTL[i].drop(["value", "year", "period", "series id", "footnotes", "sName"], axis=1)
    OPMDataMerged = OPMDataMerged.merge(JTL[i], on="DATECODE", how='left')
    display(JTL[i].head())
display(OPMDataMerged.head())
display(OPMDataMerged.tail())
display(OPMDataMerged.groupby(["SEP"]).size().reset_index(name='StratCount'))
In terms of data exploration, we first investigate numeric type attributes. Relationships, distributions, and correlation values are reviewed.
A new binary separation attribute is created to indicate whether non-sep or sep for EDA correlation purposes
#%%time
#
#
#cols = list(SampledOPMData.select_dtypes(include=['float64', 'int64']))
#cols.remove('COUNT')
#cols.remove('BLS_FEDERAL_OtherSep_Rate')
#cols.remove('BLS_FEDERAL_Quits_Rate')
#cols.remove('BLS_FEDERAL_TotalSep_Level')
#cols.remove('BLS_FEDERAL_JobOpenings_Rate')
#cols.remove('BLS_FEDERAL_OtherSep_Level')
#cols.remove('BLS_FEDERAL_Quits_Level')
#cols.remove('BLS_FEDERAL_JobOpenings_Level')
#cols.remove('BLS_FEDERAL_Layoffs_Rate')
#cols.remove('BLS_FEDERAL_Layoffs_Level')
#cols.remove('BLS_FEDERAL_TotalSep_Rate')
#cols.append('SEP')
#display(cols)
#
#plotNumeric = SampledOPMData[cols]
#
## Create binary separation attribute for EDA correlation review
##plotNumeric["SEP_bin"] = plotNumeric.SEP.replace("NS", 1)
##plotNumeric.loc[plotNumeric['SEP_bin'] != 1, 'SEP_bin'] = 0
##plotNumeric.SEP_bin = plotNumeric.SEP_bin.apply(pd.to_numeric)
#AttSplit = pd.get_dummies(plotNumeric['SEP'],prefix='SEP')
#display(AttSplit.head())
#plotNumeric = pd.concat((plotNumeric,AttSplit),axis=1) # add back into the dataframe
#
#display(plotNumeric.head())
#print("plotNumeric has {0} Records".format(len(plotNumeric)))
##print(plotNumeric.SEP_bin.dtype)
%%time
cols = list(OPMDataMerged.select_dtypes(include=['float64', 'int64']))
# Exclude COUNT (hard-coded NaN on all non-separation rows) and the BLS
# market-wide series from the per-observation numeric views.
# NOTE(review): exclusion rationale for the BLS columns inferred — confirm.
excludeCols = ['COUNT',
               'BLS_FEDERAL_OtherSep_Rate',
               'BLS_FEDERAL_Quits_Rate',
               'BLS_FEDERAL_TotalSep_Level',
               'BLS_FEDERAL_JobOpenings_Rate',
               'BLS_FEDERAL_OtherSep_Level',
               'BLS_FEDERAL_Quits_Level',
               'BLS_FEDERAL_JobOpenings_Level',
               'BLS_FEDERAL_Layoffs_Rate',
               'BLS_FEDERAL_Layoffs_Level',
               'BLS_FEDERAL_TotalSep_Rate']
for dropCol in excludeCols:
    cols.remove(dropCol)
cols.append('SEP')
display(cols)
plotNumeric = OPMDataMerged[cols]
# One-hot encode the separation code for numeric/correlation EDA.
AttSplit = pd.get_dummies(plotNumeric['SEP'], prefix='SEP')
display(AttSplit.head())
plotNumeric = pd.concat((plotNumeric, AttSplit), axis=1) # add back into the dataframe
display(plotNumeric.head())
print("plotNumeric has {0} Records".format(len(plotNumeric)))
#%%time
#
#sns.set(font_scale=1)
#sns.pairplot(plotNumeric.drop(["SEP_NS",
# "SEP_SA",
# "SEP_SB",
# "SEP_SC",
# "SEP_SD",
# "SEP_SE",
# "SEP_SF",
# "SEP_SG",
# "SEP_SH",
# "SEP_SI",
# "SEP_SJ",
# "SEP_SK",
# "SEP_SL"
# ], axis=1), hue = 'SEP', palette="hls", plot_kws={"s": 50})
%%time
# Function modified from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
sns.set()
def draw_histograms(df, variables, n_rows, n_cols):
    """Draw one histogram subplot per column name in *variables*.

    df: DataFrame holding the data; variables: iterable of column names;
    n_rows/n_cols: subplot grid shape (n_rows * n_cols must cover variables).
    """
    fig = plt.figure(figsize=(20, 20))
    for position, var_name in enumerate(variables, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        df[var_name].hist(bins=20, ax=ax, color='#58D68D')
        ax.set_title(var_name + " Distribution")
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()
# Label and one-hot SEP indicator columns to exclude from the histogram grid.
sepIndicatorCols = ['SEP',
                    "SEP_NS",
                    "SEP_SA",
                    "SEP_SB",
                    "SEP_SC",
                    "SEP_SD",
                    "SEP_SE",
                    "SEP_SF",
                    "SEP_SG",
                    "SEP_SH",
                    "SEP_SI",
                    "SEP_SJ",
                    "SEP_SK",
                    "SEP_SL"]
# Compute the reduced frame once — the original duplicated the 14-column drop
# verbatim for both arguments.
histData = plotNumeric.drop(sepIndicatorCols, axis=1)
draw_histograms(histData, histData.columns, 6, 3)
%%time
# Inspired by http://seaborn.pydata.org/examples/many_pairwise_correlations.html
#plt.matshow(plotNumeric.corr())
sns.set(style='white')
corr = plotNumeric.drop(['SEP'], axis=1).corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)  # np.bool alias was removed in NumPy 1.24
mask[np.triu_indices_from(mask, k=1)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.set(font_scale=0.95)
heatCorr = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1,
                       square=True, annot=True, linewidths=1,
                       cbar_kws={"shrink": .5}, ax=ax, fmt='.1g')
ax.tick_params(labelsize=15)
cax = plt.gcf().axes[-1]
cax.tick_params(labelsize=15)
# sns.plt was removed from seaborn long ago; call pyplot directly.
plt.show()
#sns.heatmap(corr, annot=True, linewidths=0.01, cmap=cmap, ax=ax)
Based on the distribution of attributes identified above, we have decided to take the log transform of several attributes.
%%time
# Log Transform Columns Added
# np.log/np.sqrt on the Series are vectorized; .apply would call them per
# element. Column creation order preserved from the original.
OPMDataMerged["SALARYLog"] = np.log(OPMDataMerged["SALARY"])
# sqrt rather than log for LOS — presumably because LOS can be 0 (the
# commented-out LOSLog variant added an epsilon before logging); confirm.
OPMDataMerged["LOSSqrt"] = np.sqrt(OPMDataMerged["LOS"])
OPMDataMerged["SEPCount_EFDATE_OCCLog"] = np.log(OPMDataMerged["SEPCount_EFDATE_OCC"])
OPMDataMerged["SEPCount_EFDATE_LOCLog"] = np.log(OPMDataMerged["SEPCount_EFDATE_LOC"])
OPMDataMerged["IndAvgSalaryLog"] = np.log(OPMDataMerged["IndAvgSalary"])
We next review categorical data to improve our understanding of factor levels.
#%%time
# "LOCTYPT",
# "OCCTYP",
# "OCCTYPT",
# "PPTYP",
# "PPTYPT",
# "AGYTYP",
# "OCCFAM",
# "PPGROUP",
# "PAYPLAN",
# "TOATYP",
# "WSTYP",
# "AGYSUBT",
# "AGELVL",
# "LOSLVL",
# "LOC",
# "OCC",
# "PATCO",
# "SALLVL",
# "TOA",
# "WORKSCH"]
#
#for i in dropCols:
# cols.remove(i)
#
#plotCat = SampledOPMData[cols]
#display(plotCat.head())
#print("plotCat Has {0} Records".format(len(plotCat)))
#print("Number of colums = ", len(cols))
%%time
cols = list(OPMDataMerged.select_dtypes(include=['object']))
# Drop the raw codes and *TYP levels in favor of their descriptive twins.
dropCols = ["LOCTYP",
            "LOCTYPT",
            "OCCTYP",
            "OCCTYPT",
            "PPTYP",
            "PPTYPT",
            "AGYTYP",
            "OCCFAM",
            "PPGROUP",
            "PAYPLAN",
            "TOATYP",
            "WSTYP",
            "AGYSUBT",
            "AGELVL",
            "LOSLVL",
            "LOC",
            "OCC",
            "PATCO",
            "SALLVL",
            "TOA",
            "WORKSCH"]
for dropCol in dropCols:
    cols.remove(dropCol)
plotCat = OPMDataMerged[cols]
display(plotCat.head())
print("plotCat Has {0} Records".format(len(plotCat)))
# Typo fixed in the output message ("colums" -> "columns").
print("Number of columns = ", len(cols))
High separation among the following:
Similar separation distributions among males and females, except more terminations due to contract expiration among males
High termination due to expired appt/other among following:
Bimodal Quit distribution with outlier spike at GSEGRD 9:
Individual transfers highest among levels 11, 12, 13
Majority of distribution resides in GS values per the GSEGRD observations described above.... Are other PPGRD values of any significance? What are corporate grades all about?
Top three Agencies with separation:
High contract termination in:
While Veteran Affairs and Army both have many quits and many retirees, the Army has significantly more individual transfers (on par with retirements)
Most contract terminations in 1st and 4th quarters
Retirement peaks in 2nd quarter
Number of quits increases from one quarter to the next
*Bear in mind these are quarters from single year only so time-sensitive trends may not be applicable*
High termination due to expired appt/other among following:
Number of Quits peaks at AGELVL D
Individual transfer counts mostly trend with Quits
Retirement highest at following:
Highest Quit count for LOSLVL A (< 1 year service) which then declines for levels B and C before spiking again at level D (5-9 years service)
Same pattern is observed for contract terminations but without any significant spikes with longer service
Large individual transfer spike at LOSLVL D (5-9 years service)
Retirement starts at LOSLVL D but trends upward to J
Contract terminations comprise most California terminations among top total separation states
East Coast locations may possibly have most individual transfers, the most being in Washington DC
03xx-General Admin, clerical, and office svcs highest separation by far but indicates both high number of Quits and Retirements
Many quits in 06xx-Medical
04xx-Natural Resources again indicates high number of contract terminations
01xx-Social Science has even number of Quits and retirements
Results skewed by GS
Should model full time only
def subCountPlot(att1, att2, thresh, data=None):
    """Plot zoomed count distributions for the att1 levels whose totals exceed thresh.

    Draws two horizontal count plots side by side: overall counts of the
    retained att1 levels (left, dark blue) and the same counts broken out
    by att2 (right).

    Parameters
    ----------
    att1 : str -- categorical column whose levels are counted
    att2 : str -- categorical column used as hue in the right-hand plot
    thresh : int -- minimum total count an att1 level must reach to be shown
    data : DataFrame, optional -- source frame; defaults to the module-level
        ``plotCat`` so existing three-argument calls keep working
    """
    if data is None:
        data = plotCat  # preserve original reliance on the global EDA frame
    # Count observations per (att1, att2) pair, then append a Total column
    counts = data.groupby([att1, att2]).size().unstack(fill_value=0)
    counts = pd.concat([counts, counts.sum(axis=1)], axis=1)
    counts.rename(columns={0: "Total"}, inplace=True)
    # Keep only the att1 levels whose total count surpasses the threshold
    top = counts[counts["Total"] > thresh].index.tolist()
    zoom = data[data[att1].isin(top)]  # Subset data to only the top att1 values
    f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
    sns.countplot(y=att1, data=zoom, color="blue", ax=ax1)  # Dark blue signifies zoomed data
    sns.countplot(y=att1, data=zoom, hue=att2, palette="hls", ax=ax2)
def percBarPlot(att1, att2, numColors, data=None):
    """Plot a 100%-stacked bar chart of att2 shares within each att1 level.

    Parameters
    ----------
    att1 : str -- categorical column on the x-axis
    att2 : str -- categorical column whose proportion is stacked in each bar
    numColors : int -- number of colors drawn from the 'hls' palette
    data : DataFrame, optional -- source frame; defaults to the module-level
        ``plotCat`` so existing three-argument calls keep working
    """
    if data is None:
        data = plotCat
    # Count observations per (att1, att2) pair and append a row total
    counts = data.groupby([att1, att2]).size().unstack(fill_value=0)
    counts = pd.concat([counts, counts.sum(axis=1)], axis=1)
    counts.rename(columns={0: "Total"}, inplace=True)
    # create cmap from sns color palette
    my_cmap = ListedColormap(sns.color_palette('hls', numColors).as_hex())
    # Vectorized row-percentage: each cell divided by its row total
    # (replaces the original nested Python loops over counts.values)
    perc = counts.div(counts["Total"], axis=0) * 100
    # BUG FIX: figsize was passed as the set {13, 6}; matplotlib expects a
    # (width, height) tuple, so the intended 13x6 figure is now explicit.
    perc.drop('Total', axis=1).plot(kind='bar', stacked=True, ylim=(0, 100),
                                    figsize=(13, 6),
                                    title=att1 + ' Percentage Plot',
                                    colormap=my_cmap)
temp = cols[:4] # for quick visualization debug only; may delete once complete
%%time
# For each categorical column (except the SEP target) draw a pair of plots:
# overall level counts (left) and counts split by separation type (right).
# High-cardinality columns additionally get a zoomed plot via subCountPlot,
# restricted to levels whose totals exceed a per-column threshold.
for i in cols:
if i != 'SEP':
plt.figure(i) # Required to create new figure each loop rather than drawing over previous object
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
sns.countplot(y=i, data=plotCat, color="lightblue", ax=ax1);
sns.countplot(y=i, data=plotCat, hue="SEP", palette="hls", ax=ax2);
if i == 'AGYSUB':
subCountPlot(i, 'SEP', 10000)
elif i == 'LOCT':
subCountPlot(i, 'SEP', 4000)
elif i == 'OCCT':
subCountPlot(i, 'SEP', 2000)
elif i == 'PPGRD':
subCountPlot(i, 'SEP', 6000)
elif i == 'AGYT':
subCountPlot(i, 'SEP', 3000)
%%time
# 100%-stacked percentage plots: SEP mix within each categorical level, plus
# salary-level mixes across GS grade (GSEGRD) and occupation category (PATCOT).
for i in cols:
if i != 'SEP':
percBarPlot(i, 'SEP', len(plotCat.SEP.drop_duplicates()))
percBarPlot('GSEGRD', 'SALLVLT', len(plotCat.SALLVLT.drop_duplicates()))
percBarPlot('PATCOT', 'SALLVLT', len(plotCat.SALLVLT.drop_duplicates()))
%%time
# Split violin plots of salary by occupation category (PATCOT), with gender
# as the split; observations with GENDER == 'Z' are excluded.
sns.set(style="whitegrid", palette="pastel", color_codes=True)
sns.violinplot(x="PATCOT", y="SALARY", hue="GENDER", data=OPMDataMerged[OPMDataMerged.GENDER != 'Z'], split=True,
inner="quart", palette={"M": "b", "F": "pink"})
sns.despine(left=True)
%%time
# Draw a nested violinplot and split the violins for easier comparison
# (salary by separation type, split by gender).
sns.violinplot(x="SEP", y="SALARY", hue="GENDER", data=OPMDataMerged[OPMDataMerged.GENDER != 'Z'], split=True,
inner="quart", palette={"M": "b", "F": "pink"})
sns.despine(left=True)
%%time
# Salary-by-separation violins faceted into one column per occupation category.
sns.factorplot(x="SEP", y="SALARY", hue="GENDER", col="PATCOT",
data=OPMDataMerged[OPMDataMerged.GENDER != 'Z'],
kind="violin", split=True, aspect=.4, size=10);
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT", data=OPMDataMerged,
# kind="violin", split=True, aspect=.4, size=10, palette = "hls");
#%%time
#
#g = sns.PairGrid(data=OPMDataMerged,
# x_vars=["SEP","PATCOT"],
# y_vars=["SALARY", "LOS", "LowerLimitAge", "YearsToRetirement"],
# aspect=1, size=10)
#g.map(sns.violinplot, palette="pastel");
# Free the EDA working frames before the next processing stage.
del(plotNumeric, plotCat)
After analyzing the above plots for our categorical data, we have decided to narrow our focus due to the large variability in the dataset. We take the below actions on our dataset:
In addition, we have opted to remove the below attributes for model generation:
Our goal is to limit our focus to Professional occupations, build a model, then test that generated model on the Administration segment of the population.
%%time
print(len(OPMDataMerged))
#Removing Attributes: identifiers, code/description duplicates, and grouping
#keys that will not feed the model.
cols = list(OPMDataMerged.columns)
dropCols = ["QTR",
"AGYTYP",
"AGYTYPT",
"AGY",
"AGYT",
"AGYSUB",
"AGYSUBT",
"GENDER",
"COUNT",
"PAYPLAN",
"PAYPLANT",
"PPGRD",
"LOSLVL",
"LOSLVLT",
"SALLVL",
"SALLVLT",
"OCC",
"OCCT"]
for i in dropCols:
if i in cols:
cols.remove(i)
OPMDataMerged = OPMDataMerged[cols]
# Keep only Full-time Nonseasonal observations
OPMDataMerged = OPMDataMerged[OPMDataMerged["WORKSCH"] == "F"]
#Remove the location US-SUPPRESSED (SEE DATA DEFINITIONS)
OPMDataMerged = OPMDataMerged[OPMDataMerged["LOC"] != "US"]
#Keep only General Schedule grades of 7 and above (GSEGRD >= 7).
OPMDataMerged["GSEGRD"] = OPMDataMerged["GSEGRD"].apply(pd.to_numeric)
OPMDataMerged = OPMDataMerged[OPMDataMerged["GSEGRD"] >= 7]
#Focus model generation on White Collar Jobs only
OPMDataMerged = OPMDataMerged[OPMDataMerged["OCCTYP"] == "1"]
#Create a Training set for the Professional PATCO value, and a Testing set for Administration
OPMDataMergedProf = OPMDataMerged[OPMDataMerged["PATCO"] == "1"]
OPMDataMergedAdmin = OPMDataMerged[OPMDataMerged["PATCO"] == "2"]
display(OPMDataMergedProf.head())
print(len(OPMDataMergedProf))
display(OPMDataMergedAdmin.head())
print(len(OPMDataMergedAdmin))
#curious on stratum SEP counts for full remaining data
stratum = pd.DataFrame({'StratCount' : OPMDataMerged.groupby(["SEP"]).size()}).reset_index()
display(stratum)
#Assess Stratum SEP Counts for Prof, for use in sampling
#Each SEP stratum's sample is capped at maxSize observations; smaller strata
#are taken in full (StratCountSample == StratCount).
maxSize=7500
stratumProf = pd.DataFrame({'StratCount' : OPMDataMergedProf.groupby(["SEP"]).size()}).reset_index()
stratumProf.loc[stratumProf["StratCount"]>maxSize,"StratCountSample"] = maxSize
stratumProf.loc[stratumProf["StratCount"]<=maxSize,"StratCountSample"] = stratumProf["StratCount"]
#else: stratum["StratCountSample"] = stratum["StratCount"]
display(stratumProf)
#Assess Stratum SEP Counts for Admin, for use in sampling (same capping rule)
maxSize=7500
stratumAdmin = pd.DataFrame({'StratCount' : OPMDataMergedAdmin.groupby(["SEP"]).size()}).reset_index()
stratumAdmin.loc[stratumAdmin["StratCount"]>maxSize,"StratCountSample"] = maxSize
stratumAdmin.loc[stratumAdmin["StratCount"]<=maxSize,"StratCountSample"] = stratumAdmin["StratCount"]
#else: stratum["StratCountSample"] = stratum["StratCount"]
display(stratumAdmin)
%%time
def aggStratPop(stratum, OPMDataMerged):
    """Compute per-(DATECODE, AGELVL) sample sizes for each separation stratum.

    For every SEP level in ``stratum``, counts observations per
    (DATECODE, AGELVL) cell, converts those counts to within-stratum
    proportions ``p``, and allocates the stratum's capped sample size
    (``StratCountSample``) proportionally across the cells.

    Parameters
    ----------
    stratum : DataFrame with columns SEP, StratCount, StratCountSample
        (fresh RangeIndex, as produced by the stratum-count cells above)
    OPMDataMerged : DataFrame -- population to be sampled

    Returns
    -------
    list of DataFrame, one per SEP level, each carrying an integer
    ``StratSampleSize`` column of per-cell sample sizes.
    """
    AggStrat = []
    for i in range(len(stratum)):
        # .iloc replaces the deprecated .ix indexer (removed in pandas >= 1.0);
        # positional access is equivalent here because stratum has a RangeIndex.
        sep = stratum["SEP"].iloc[i]
        StratCountSample = stratum["StratCountSample"].iloc[i]
        print("Stratum Sample Size Calculations for SEP: {}".format(sep))
        # Filter once per stratum instead of twice (cell counts + total count)
        subset = OPMDataMerged[OPMDataMerged["SEP"] == sep]
        AggStrat.append(pd.DataFrame({'StratCount' : subset.groupby(["DATECODE", "AGELVL"]).size()}).reset_index())
        AggStrat[i]["SEP"] = sep
        AggStrat[i]["TotalCount"] = len(subset)
        AggStrat[i]["p"] = AggStrat[i]["StratCount"] / AggStrat[i]["TotalCount"]
        AggStrat[i]["StratCountSample"] = StratCountSample
        # Proportional allocation, rounded to whole observations
        AggStrat[i]["StratSampleSize"] = round(AggStrat[i]["p"] * StratCountSample).apply(int)
        display(AggStrat[i].head())
        print("totalStratumSampleSize: ", AggStrat[i]["StratSampleSize"].sum())
    return AggStrat
def SampleStrata(stratum, OPMDataMerged, FileName):
    """Draw a stratified random sample of OPMDataMerged and pickle it.

    Uses aggStratPop to size each (SEP, DATECODE, AGELVL) cell, samples every
    cell with ``random_state`` equal to its own sample size (reproducible
    seeding), concatenates all cell samples, pickles the result under
    ``FileName``, and returns the sampled DataFrame.
    """
    AggStrat = aggStratPop(stratum, OPMDataMerged)
    cell_samples = []
    for StratSampleSize in AggStrat:
        for j in range(len(StratSampleSize)):
            # .iloc replaces the deprecated .ix indexer (removed in pandas >= 1.0)
            SEP = StratSampleSize["SEP"].iloc[j]
            DATECODE = StratSampleSize["DATECODE"].iloc[j]
            AGELVL = StratSampleSize["AGELVL"].iloc[j]
            SampleSize = StratSampleSize["StratSampleSize"].iloc[j]
            print(SEP, DATECODE, AGELVL, SampleSize)
            cell = OPMDataMerged[(OPMDataMerged["SEP"]==SEP)
                                 & (OPMDataMerged["DATECODE"]==DATECODE)
                                 & (OPMDataMerged["AGELVL"]==AGELVL)]
            cell_samples.append(cell.sample(SampleSize, random_state=SampleSize))
        clear_display()
    # Single concat of all cell samples.  The original rebuilt a cumulative
    # concat on every outer iteration and kept only the last one -- the final
    # result is identical, computed once here.
    SampledOPMData = pd.concat(cell_samples).reset_index(drop=True)
    pickleObject(SampledOPMData, FileName)
    clear_display()
    return SampledOPMData
Using a seed value equal to each strata sample size, we take random samples according to the computed sizes above. We loop through each Separation Type's Aggregated Strata Sample Sizes; Identify all observations matching on Datecode, Separation Type, and AgeLevel; and finally sample those observations with the computed sample size.
%%time
##Prof Data Sampling: reuse a previously pickled sample when available,
##otherwise draw a fresh stratified sample and pickle it.
if os.path.isfile(PickleJarPath+"/SampledOPMDataProf.pkl"):
print("Found the File! Loading Pickle Now!")
SampledOPMDataProf = unpickleObject("SampledOPMDataProf")
else:
SampledOPMDataProf= SampleStrata(stratumProf, OPMDataMergedProf, "SampledOPMDataProf")
%%time
# Sanity check: sampled size and SEP counts for the Professional sample
print(len(SampledOPMDataProf))
display(SampledOPMDataProf.head())
display(pd.DataFrame({'StratCount' : SampledOPMDataProf.groupby(["SEP"]).size()}).reset_index())
%%time
#### Analyze Missing Values (columns >= 99.9% complete are filtered out so
#### the nullity matrix highlights only attributes with real gaps)
filtered_msnoData = msno.nullity_sort(msno.nullity_filter(SampledOPMDataProf, filter='bottom', n=15, p=0.999), sort='descending')
msno.matrix(filtered_msnoData)
del filtered_msnoData
%%time
##Admin Data Sampling (same pickle-or-sample pattern as Prof above)
if os.path.isfile(PickleJarPath+"/SampledOPMDataAdmin.pkl"):
print("Found the File! Loading Pickle Now!")
SampledOPMDataAdmin = unpickleObject("SampledOPMDataAdmin")
else:
SampledOPMDataAdmin= SampleStrata(stratumAdmin, OPMDataMergedAdmin, "SampledOPMDataAdmin")
%%time
print(len(SampledOPMDataAdmin))
display(SampledOPMDataAdmin.head())
display(pd.DataFrame({'StratCount' : SampledOPMDataAdmin.groupby(["SEP"]).size()}).reset_index())
%%time
#### Analyze Missing Values (Admin sample)
filtered_msnoData = msno.nullity_sort(msno.nullity_filter(SampledOPMDataAdmin, filter='bottom', n=15, p=0.999), sort='descending')
msno.matrix(filtered_msnoData)
del filtered_msnoData
%%time
## Describe Summary for our Model Professional Subgroup for Modeling
display(SampledOPMDataProf.describe().transpose())
#%%time
#OPMDataMerged.to_csv("OPMDataMerged.csv")
#os.path.getsize("OPMDataMerged.csv") #Display file size in bytes
Below, we re-run the exploratory visuals using the stratified SampledOPMDataProf sample in place of the full merged dataset.
%%time
cols = list(SampledOPMDataProf.select_dtypes(include=['float64', 'int64']))
cols.remove('BLS_FEDERAL_OtherSep_Rate')
cols.remove('BLS_FEDERAL_Quits_Rate')
cols.remove('BLS_FEDERAL_TotalSep_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Rate')
cols.remove('BLS_FEDERAL_OtherSep_Level')
cols.remove('BLS_FEDERAL_Quits_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Level')
cols.remove('BLS_FEDERAL_Layoffs_Rate')
cols.remove('BLS_FEDERAL_Layoffs_Level')
cols.remove('BLS_FEDERAL_TotalSep_Rate')
cols.append('SEP')
display(cols)
plotNumeric = SampledOPMDataProf[cols]
# Create binary separation attribute for EDA correlation review
#plotNumeric["SEP_bin"] = plotNumeric.SEP.replace("NS", 1)
#plotNumeric.loc[plotNumeric['SEP_bin'] != 1, 'SEP_bin'] = 0
#plotNumeric.SEP_bin = plotNumeric.SEP_bin.apply(pd.to_numeric)
AttSplit = pd.get_dummies(plotNumeric['SEP'],prefix='SEP')
display(AttSplit.head())
plotNumeric = pd.concat((plotNumeric,AttSplit),axis=1) # add back into the dataframe
display(plotNumeric.head())
print("plotNumeric has {0} Records".format(len(plotNumeric)))
#print(plotNumeric.SEP_bin.dtype)
%%time
# Pairwise scatter matrix of the numeric attributes colored by separation
# type; the one-hot SEP_* dummies are dropped so only raw numerics are plotted.
sns.set(font_scale=1)
sns.pairplot(plotNumeric.drop(["SEP_NS",
"SEP_SA",
"SEP_SB",
"SEP_SC",
"SEP_SD",
"SEP_SE",
"SEP_SF",
"SEP_SG",
"SEP_SH",
"SEP_SI",
"SEP_SJ",
"SEP_SK",
"SEP_SL"
], axis=1), hue = 'SEP', palette="hls", plot_kws={"s": 50})
%%time
# Function modified from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
sns.set()
def draw_histograms(df, variables, n_rows, n_cols):
fig=plt.figure(figsize=(20,20))
for i, var_name in enumerate(variables):
ax=fig.add_subplot(n_rows,n_cols,i+1)
df[var_name].hist(bins=20,ax=ax, color='#58D68D')
ax.set_title(var_name+" Distribution")
fig.tight_layout() # Improves appearance a bit.
plt.show()
draw_histograms(plotNumeric.drop(['SEP',
"SEP_NS",
"SEP_SA",
"SEP_SB",
"SEP_SC",
"SEP_SD",
"SEP_SE",
"SEP_SF",
"SEP_SG",
"SEP_SH",
"SEP_SI",
"SEP_SJ",
"SEP_SK",
"SEP_SL"
], axis=1),
plotNumeric.drop(['SEP',
"SEP_NS",
"SEP_SA",
"SEP_SB",
"SEP_SC",
"SEP_SD",
"SEP_SE",
"SEP_SF",
"SEP_SG",
"SEP_SH",
"SEP_SI",
"SEP_SJ",
"SEP_SK",
"SEP_SL"
], axis=1).columns, 6, 3)
%%time
# Inspired by http://seaborn.pydata.org/examples/many_pairwise_correlations.html
#plt.matshow(plotNumeric.corr())
sns.set(style='white')
corr = plotNumeric.drop(['SEP'], axis=1).corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask, k=1)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.set(font_scale=0.95)
heatCorr = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1,
square=True, annot=True, linewidths=1,
cbar_kws={"shrink": .5}, ax=ax, fmt='.1g')
#heatCorr.
ax.tick_params(labelsize=15)
cax = plt.gcf().axes[-1]
cax.tick_params(labelsize=15)
sns.plt.show()
#sns.heatmap(corr, annot=True, linewidths=0.01, cmap=cmap, ax=ax)
%%time
cols = list(SampledOPMDataProf.select_dtypes(include=['object']))
dropCols = ["LOCTYP",
"LOCTYPT",
"OCCTYP",
"OCCTYPT",
"PPTYP",
"PPTYPT",
"AGYTYP",
"OCCFAM",
"PPGROUP",
"PAYPLAN",
"TOATYP",
"WSTYP",
"AGYSUBT",
"AGELVL",
"LOSLVL",
"LOC",
"OCC",
"PATCO",
"SALLVL",
"TOA",
"WORKSCH"]
for i in dropCols:
if(i in list(SampledOPMDataProf.columns)): cols.remove(i)
plotCat = SampledOPMDataProf[cols]
display(plotCat.head())
print("plotCat Has {0} Records".format(len(plotCat)))
print("Number of colums = ", len(cols))
%%time
# Per-categorical-column count plots on the Professional sample, mirroring
# the earlier full-dataset EDA (LOCT zoom threshold lowered for the smaller
# sampled population).
for i in cols:
if i != 'SEP':
plt.figure(i) # Required to create new figure each loop rather than drawing over previous object
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
sns.countplot(y=i, data=plotCat, color="lightblue", ax=ax1);
sns.countplot(y=i, data=plotCat, hue="SEP", palette="hls", ax=ax2);
if i == 'AGYSUB':
subCountPlot(i, 'SEP', 10000)
elif i == 'LOCT':
subCountPlot(i, 'SEP', 1000)
elif i == 'OCCT':
subCountPlot(i, 'SEP', 2000)
elif i == 'PPGRD':
subCountPlot(i, 'SEP', 6000)
elif i == 'AGYT':
subCountPlot(i, 'SEP', 3000)
%%time
# 100%-stacked percentage plots of SEP mix within each categorical level.
for i in cols:
if i != 'SEP':
percBarPlot(i, 'SEP', len(plotCat.SEP.drop_duplicates()))
%%time
# Salary distribution by occupation category for the Professional sample.
sns.set(style="whitegrid", palette="pastel", color_codes=True)
sns.violinplot(x="PATCOT", y="SALARY", data=SampledOPMDataProf, split=True,
inner="quart")
sns.despine(left=True)
%%time
# Draw a nested violinplot and split the violins for easier comparison
sns.violinplot(x="SEP", y="SALARY", data=SampledOPMDataProf, split=True,
inner="box", scale="area", cut=0)
sns.despine(left=True)
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT",
# data=SampledOPMDataProf,
# kind="violin", split=True, aspect=.5, size=15);
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT", data=SampledOPMDataProf,
# kind="violin", split=True, aspect=.4, size=10);
%%time
# Violin grid: salary, tenure, and age-bound attributes vs SEP and PATCO.
g = sns.PairGrid(data=SampledOPMDataProf,
x_vars=["SEP","PATCOT"],
y_vars=["SALARY", "LOS", "LowerLimitAge", "YearsToRetirement"],
aspect=1, size=10)
g.map(sns.violinplot, palette="pastel", inner="quart");
There are several separation types we would like to either roll up, or remove altogether.
We have chosen to roll separation into two Binary Categories.
1) NS, Non-Separation comprised of: a) NS, Non-Separation b) SA,Transfer Out - Individual Transfer c) SB,Transfer Out - Mass Transfer d) SD,Retirement - Voluntary e) SE,Retirement - Early Out f) SF,Retirement - Disability g) SG,Retirement - Other
2) SC, Quit
# Keep only the separation codes of interest, then collapse SEP to a binary
# label: "SC" (Quit) vs "NS" (everything else).  ``.isin`` replaces the
# original chain of eight OR'd equality comparisons per dataset.
keepSeps = ["NS", "SD", "SE", "SF", "SH", "SA", "SB", "SC"]
SampledOPMDataProf = SampledOPMDataProf[SampledOPMDataProf["SEP"].isin(keepSeps)]
SampledOPMDataProf.loc[SampledOPMDataProf["SEP"] != "SC", "SEP"] = "NS"
SampledOPMDataAdmin = SampledOPMDataAdmin[SampledOPMDataAdmin["SEP"].isin(keepSeps)]
SampledOPMDataAdmin.loc[SampledOPMDataAdmin["SEP"] != "SC", "SEP"] = "NS"
#Assess Stratum SEP Counts for Prof, for use in sampling
#(recomputed after the binary SEP rollup, so only NS and SC remain; samples
#are again capped at maxSize observations per stratum)
maxSize=7500
stratumProf = pd.DataFrame({'StratCount' : SampledOPMDataProf.groupby(["SEP"]).size()}).reset_index()
stratumProf.loc[stratumProf["StratCount"]>maxSize,"StratCountSample"] = maxSize
stratumProf.loc[stratumProf["StratCount"]<=maxSize,"StratCountSample"] = stratumProf["StratCount"]
#else: stratum["StratCountSample"] = stratum["StratCount"]
display(stratumProf)
#Assess Stratum SEP Counts for Admin, for use in sampling (same capping rule)
maxSize=7500
stratumAdmin = pd.DataFrame({'StratCount' : SampledOPMDataAdmin.groupby(["SEP"]).size()}).reset_index()
stratumAdmin.loc[stratumAdmin["StratCount"]>maxSize,"StratCountSample"] = maxSize
stratumAdmin.loc[stratumAdmin["StratCount"]<=maxSize,"StratCountSample"] = stratumAdmin["StratCount"]
#else: stratum["StratCountSample"] = stratum["StratCount"]
display(stratumAdmin)
# Re-run stratified sampling on the binary-label data.
# BUG FIX: the Admin call originally passed the Professional strata table
# (stratumProf) and reused the Professional pickle name
# ("SampledOPMDataProfBinary"), which mis-sized the Admin strata and
# overwrote the Prof sample on disk.  It now uses stratumAdmin and its own
# pickle file.
SampledOPMDataProf = SampleStrata(stratumProf, SampledOPMDataProf, "SampledOPMDataProfBinary")
SampledOPMDataAdmin = SampleStrata(stratumAdmin, SampledOPMDataAdmin, "SampledOPMDataAdminBinary")
%%time
cols = list(SampledOPMDataProf.select_dtypes(include=['float64', 'int64']))
cols.remove('BLS_FEDERAL_OtherSep_Rate')
cols.remove('BLS_FEDERAL_Quits_Rate')
cols.remove('BLS_FEDERAL_TotalSep_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Rate')
cols.remove('BLS_FEDERAL_OtherSep_Level')
cols.remove('BLS_FEDERAL_Quits_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Level')
cols.remove('BLS_FEDERAL_Layoffs_Rate')
cols.remove('BLS_FEDERAL_Layoffs_Level')
cols.remove('BLS_FEDERAL_TotalSep_Rate')
cols.append('SEP')
display(cols)
plotNumeric = SampledOPMDataProf[cols]
# Create binary separation attribute for EDA correlation review
#plotNumeric["SEP_bin"] = plotNumeric.SEP.replace("NS", 1)
#plotNumeric.loc[plotNumeric['SEP_bin'] != 1, 'SEP_bin'] = 0
#plotNumeric.SEP_bin = plotNumeric.SEP_bin.apply(pd.to_numeric)
AttSplit = pd.get_dummies(plotNumeric['SEP'],prefix='SEP')
display(AttSplit.head())
plotNumeric = pd.concat((plotNumeric,AttSplit),axis=1) # add back into the dataframe
display(plotNumeric.head())
print("plotNumeric has {0} Records".format(len(plotNumeric)))
#print(plotNumeric.SEP_bin.dtype)
%%time
# Pairwise scatter matrix colored by the binary SEP label; the two one-hot
# dummies are dropped from the plotted attributes.
sns.set(font_scale=1)
sns.pairplot(plotNumeric.drop(['SEP_NS',
'SEP_SC'], axis=1), hue = 'SEP', palette="hls", plot_kws={"s": 50})
%%time
# Function modified from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
sns.set()
def draw_histograms(df, variables, n_rows, n_cols):
fig=plt.figure(figsize=(20,20))
for i, var_name in enumerate(variables):
ax=fig.add_subplot(n_rows,n_cols,i+1)
df[var_name].hist(bins=20,ax=ax, color='#58D68D')
ax.set_title(var_name+" Distribution")
fig.tight_layout() # Improves appearance a bit.
plt.show()
draw_histograms(plotNumeric.drop(['SEP',
'SEP_NS',
'SEP_SC'
], axis=1),
plotNumeric.drop(['SEP',
'SEP_NS',
'SEP_SC'
], axis=1).columns, 6, 3)
%%time
# Inspired by http://seaborn.pydata.org/examples/many_pairwise_correlations.html
#plt.matshow(plotNumeric.corr())
sns.set(style='white')
corr = plotNumeric.drop(['SEP'], axis=1).corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask, k=1)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.set(font_scale=0.95)
heatCorr = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1,
square=True, annot=True, linewidths=1,
cbar_kws={"shrink": .5}, ax=ax, fmt='.1g')
#heatCorr.
ax.tick_params(labelsize=15)
cax = plt.gcf().axes[-1]
cax.tick_params(labelsize=15)
sns.plt.show()
#sns.heatmap(corr, annot=True, linewidths=0.01, cmap=cmap, ax=ax)
%%time
cols = list(SampledOPMDataProf.select_dtypes(include=['object']))
dropCols = ["LOCTYP",
"LOCTYPT",
"OCCTYP",
"OCCTYPT",
"PPTYP",
"PPTYPT",
"AGYTYP",
"OCCFAM",
"PPGROUP",
"PAYPLAN",
"TOATYP",
"WSTYP",
"AGYSUBT",
"AGELVL",
"LOSLVL",
"LOC",
"OCC",
"PATCO",
"SALLVL",
"TOA",
"WORKSCH"]
for i in dropCols:
if(i in list(SampledOPMDataProf.columns)): cols.remove(i)
plotCat = SampledOPMDataProf[cols]
display(plotCat.head())
print("plotCat Has {0} Records".format(len(plotCat)))
print("Number of colums = ", len(cols))
%%time
# Per-categorical-column count plots on the binary-label Professional sample,
# mirroring the earlier EDA passes.
for i in cols:
if i != 'SEP':
plt.figure(i) # Required to create new figure each loop rather than drawing over previous object
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
sns.countplot(y=i, data=plotCat, color="lightblue", ax=ax1);
sns.countplot(y=i, data=plotCat, hue="SEP", palette="hls", ax=ax2);
if i == 'AGYSUB':
subCountPlot(i, 'SEP', 10000)
elif i == 'LOCT':
subCountPlot(i, 'SEP', 1000)
elif i == 'OCCT':
subCountPlot(i, 'SEP', 2000)
elif i == 'PPGRD':
subCountPlot(i, 'SEP', 6000)
elif i == 'AGYT':
subCountPlot(i, 'SEP', 3000)
%%time
# 100%-stacked percentage plots of the binary SEP mix within each level.
for i in cols:
if i != 'SEP':
percBarPlot(i, 'SEP', len(plotCat.SEP.drop_duplicates()))
%%time
# Salary distribution by occupation category.
sns.set(style="whitegrid", palette="pastel", color_codes=True)
sns.violinplot(x="PATCOT", y="SALARY", data=SampledOPMDataProf, split=True,
inner="quart")
sns.despine(left=True)
%%time
# Draw a nested violinplot and split the violins for easier comparison
sns.violinplot(x="SEP", y="SALARY", data=SampledOPMDataProf, split=True,
inner="box", scale="area", cut=0)
sns.despine(left=True)
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT",
# data=SampledOPMDataProf,
# kind="violin", split=True, aspect=.5, size=15);
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT", data=SampledOPMDataProf,
# kind="violin", split=True, aspect=.4, size=10);
%%time
# Violin grid: salary, tenure, and age-bound attributes vs SEP and PATCO.
g = sns.PairGrid(data=SampledOPMDataProf,
x_vars=["SEP","PATCOT"],
y_vars=["SALARY", "LOS", "LowerLimitAge", "YearsToRetirement"],
aspect=1, size=10)
g.map(sns.violinplot, palette="pastel", inner="quart");
Now that we have the dataset sampled, we still have some legwork necessary to convert our categorical attributes into binary integer values. Below we walk through this process for the following Attributes:
Once these attributes have been encoded and description columns removed, we end up with a total of 2446 attributes in our dataset for analysis in our model generation.
# Clean up old objects no longer needed, to clear up memory
process = psutil.Process(os.getpid())
print("Memory Usage before Cleanup: ", process.memory_info().rss)
# Drop each intermediate object if it still exists.  This loop replaces the
# original run of 22 repetitive ``if 'name' in dir(): del name`` statements;
# at module level ``globals().pop(name, None)`` is equivalent to the guarded
# ``del`` (it is a no-op when the name is absent).
for _name in ["AGELVL", "AggIndAvgSalary", "AggIndAvgSalary2",
              "AggSEPCount_EFDATE_LOC", "AggSEPCount_EFDATE_OCC", "AggStrat",
              "DATECODE", "EMPColList", "EMPDataOrig4Q", "maxSize",
              "OPMColList", "OPMDataFiles", "OPMDataList", "OPMDataMerged",
              "OPMDataOrig", "SEP", "SampleSize", "SampledOPMStratumData",
              "SampledOPMStratumDataList", "StratCountSample",
              "StratSampleSize", "JTL"]:
    globals().pop(_name, None)
process = psutil.Process(os.getpid())
print("Memory Usage after Cleanup: ", process.memory_info().rss)
display(SampledOPMDataProf.head())
SampledOPMDataProf.info()
%%time
# Build the model-ready binary-encoded dataset: load the cached pickle when
# present, otherwise derive it from the Professional sample below.
if os.path.isfile(PickleJarPath+"/OPMAnalysisDataNoFamBinary.pkl"):
print("Found the File! Loading Pickle Now!")
OPMAnalysisDataNoFamBinary = unpickleObject("OPMAnalysisDataNoFamBinary")
else:
OPMAnalysisDataNoFamBinary = SampledOPMDataProf.copy()
# Columns dropped outright: text descriptions, redundant codes, raw values
# already represented by level columns, and aggregate counts.
# NOTE(review): "QTR" appears twice in this list; harmless, since the
# deletion loop below checks membership before deleting.
cols = ["GENDER",
"DATECODE",
"QTR",
"COUNT",
"AGYTYPT",
"AGYT",
"AGYSUB",
"AGYSUBT",
"QTR",
"AGELVLT",
"LOSLVL",
"LOSLVLT",
"LOCTYPT",
"LOCT",
"OCCTYP",
"OCCTYPT",
"OCCFAM",
"OCCFAMT",
"OCC",
"OCCT",
"PATCO",
"PPGRD",
"PATCOT",
"PPTYPT",
"PPGROUPT",
"PAYPLAN",
"PAYPLANT",
"SALLVLT",
"TOATYPT",
"TOAT",
"WSTYP",
"WSTYPT",
"WORKSCH",
"WORKSCHT",
"SALARY",
"LOS",
"SEPCount_EFDATE_OCC",
"SEPCount_EFDATE_LOC"
]
#delete cols from analysis data
for col in cols:
if col in list(OPMAnalysisDataNoFamBinary.columns):
del OPMAnalysisDataNoFamBinary[col]
OPMAnalysisDataNoFamBinary.info()
# Remaining categorical code columns to be one-hot (binary) encoded.
cols = ["AGELVL",
"LOC",
"SALLVL",
"TOA",
"AGYTYP",
"AGY",
"LOCTYP",
"PPTYP",
"PPGROUP",
"TOATYP"
]
#Split Values for cols: one-hot encode each and drop the original column
for col in cols:
if col in list(OPMAnalysisDataNoFamBinary.columns):
AttSplit = pd.get_dummies(OPMAnalysisDataNoFamBinary[col],prefix=col)
display(AttSplit.head())
OPMAnalysisDataNoFamBinary = pd.concat((OPMAnalysisDataNoFamBinary,AttSplit),axis=1) # add back into the dataframe
del OPMAnalysisDataNoFamBinary[col]
# Cache the encoded dataset for later runs, then summarize it.
pickleObject(OPMAnalysisDataNoFamBinary, "OPMAnalysisDataNoFamBinary")
display(OPMAnalysisDataNoFamBinary.head())
print("Number of Columns: ",len(OPMAnalysisDataNoFamBinary.columns))
OPMAnalysisDataNoFamBinary.info()
Below is a display of all remaining attributes and their corresponding data types for analysis
%%time
data_type = []
for idx, col in enumerate(OPMAnalysisDataNoFamBinary.columns):
data_type.append(OPMAnalysisDataNoFamBinary.dtypes[idx])
summary_df = {'Attribute Name' : pd.Series(OPMAnalysisDataNoFamBinary.columns, index = range(len(OPMAnalysisDataNoFamBinary.columns))), 'Data Type' : pd.Series(data_type, index = range(len(OPMAnalysisDataNoFamBinary.columns)))}
summary_df = pd.DataFrame(summary_df)
display(summary_df)
del data_type, summary_df
We also scale the data values to remove bias in our models due to different attribute scales. Without scaling the data, attributes such as SALARY and LOS would carry heavier weights when compared against the binary encoded attributes and BLS data. This would cause unbalanced and improperly analyzed data for model creation.
# Scale features to [0, 1] so large-magnitude attributes do not dominate the
# binary-encoded ones; the SEP label is excluded from scaling.
OPMScaledAnalysisData = OPMAnalysisDataNoFamBinary.copy()
del OPMScaledAnalysisData["SEP"]
%%time
OPMAnalysisScalerFit = MinMaxScaler().fit(OPMScaledAnalysisData)
## Pickle for later re-use if needed
pickleObject(OPMAnalysisScalerFit, "OPMAnalysisScalerFit")
# Re-wrap the scaled ndarray as a DataFrame with the original column names
OPMScaledAnalysisData = pd.DataFrame(OPMAnalysisScalerFit.transform(OPMScaledAnalysisData), columns = OPMScaledAnalysisData.columns)
display(OPMScaledAnalysisData.head())
Our objective is to reduce dimensionality through identification of principal components. We have chosen to use the full column count (99) as the maximum number of components to be produced. Because our goal is to reduce the number of attributes needed for a model, we expect far fewer than 99 principal components to explain over 80% of the variance within the dataset. We will review each component's explained variance further to determine the proper number of components to include later during model generation. Note that randomized PCA was chosen in order to use singular value decomposition in our dimensionality reduction efforts, due to the large size of our data set.
%%time
# Fit randomized-SVD PCA with as many components as there are columns; the
# per-component explained variance is inspected below to choose a cutoff.
# Seeded with the dataset length so results are reproducible.
seed = len(OPMScaledAnalysisData)
print(OPMScaledAnalysisData.shape)
pca_class = PCA(n_components=len(OPMScaledAnalysisData.columns), svd_solver='randomized', random_state=seed)
pca_class.fit(OPMScaledAnalysisData)
Below, the resulting components have been ordered by eigenvector value, and these values are portrayed as ratios of variance explained by each component. In order to identify the principal components to be included during model generation, we review the rate at which explained variance decreases in significance from one principal component to the next. Accompanying these proportion values is a scree plot representing these same values in visual form. By plotting the scree plot, it is easier to judge where this rate of decreasing explained variance changes. Note the rate of change in explained variance among the first 8 principal components, with another less significant change through the 22nd component. After the 22nd component, the rate of decreasing explained variance begins to somewhat flatten out.
%%time
#The amount of variance that each PC explains
var= pca_class.explained_variance_ratio_
sns.set(font_scale=1.7)
# Scree plot: per-component explained variance (as a percentage)
plt.plot(range(1,len(OPMScaledAnalysisData.columns)+1), var*100, marker = '.', color = 'red', markerfacecolor = 'black')
plt.xlabel('Principal Components')
plt.ylabel('Percentage of Explained Variance')
plt.title('Scree Plot')
plt.axis([0, len(OPMScaledAnalysisData.columns)+1, -0.1, 9])
# Mark the elbow chosen for modeling (22 components; see discussion above)
plt.annotate('22nd Component', xy=(22, 1.2), xytext=(40, 4),
arrowprops=dict(facecolor='black', shrink=0.05),)
# suppress=True prints plain decimals instead of scientific notation
np.set_printoptions(suppress=True)
print(np.round(var, decimals=4)*100)
By now referring to the cumulative variance values and associated plot below, it may be seen that the cumulative variance increases along a fairly smooth, gradually flattening curve. In an attempt to achieve a cumulative variance explained of greater than 80%, we end at 22 principal components. For this reason, 22 principal components may be selected as being the most appropriate for separation classification modeling given the variables among these data.
#Cumulative Variance explains
var1=np.cumsum(np.round(pca_class.explained_variance_ratio_, decimals=4)*100)
# Cumulative explained-variance curve; crosses ~80% around the 22nd component
plt.plot(range(1,len(OPMScaledAnalysisData.columns)+1), var1, marker = '.', color = 'green', markerfacecolor = 'black')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance (Sum %)')
plt.title('Cumulative Variance Plot')
plt.axis([0, len(OPMScaledAnalysisData.columns)+1, 10, len(OPMScaledAnalysisData.columns)+1])
plt.annotate('22nd Component', xy=(22, 80.54), xytext=(40, 60),
arrowprops=dict(facecolor='black', shrink=0.05),)
print(var1)
We proceed to analyze the first 4 component Feature Loadings more carefully. See below, plots of the top 10 loadings for each component.
# Bar plots of the 10 strongest feature loadings (by absolute weight) for
# each of the first four principal components.
plt.rcParams['figure.figsize'] = (20, 12)
fig = plt.figure()
plt.rcParams.update({'font.size': 16})
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
for i in range(0,4):
components = pd.Series(pca_class.components_[i], index=OPMScaledAnalysisData.columns)
# Names of the 10 features with the largest absolute loadings
maxcomponent = pd.Series(pd.DataFrame(abs(components).sort_values(ascending=False).head(10)).index)
matplotlib.rc('xtick', labelsize=12)
ax = fig.add_subplot(2,2,i + 1)
# Signed loadings for those top features (sign preserved in the bars)
weightsplot = pd.Series(components, index=maxcomponent)
weightsplot.plot(title = "Principal Component "+ str(i+1), kind='bar', color = 'Tomato', ax = ax)
plt.tight_layout()
plt.show()
MaxPC = 22  # number of retained components (>80% cumulative variance)
# Collect the 15 strongest feature loadings (by absolute weight) from each of
# the first MaxPC components, then deduplicate into one sorted feature list.
PCList = []
for comp_idx in range(MaxPC):
    loadings = pd.Series(pca_class.components_[comp_idx],
                         index=OPMScaledAnalysisData.columns)
    strongest = loadings.abs().sort_values(ascending=False).head(15)
    PCList.append(pd.Series(strongest.index))
PCList = pd.concat(PCList).drop_duplicates().sort_values(ascending=True).reset_index(drop = True)
print(PCList)
PCList = list(PCList)
A total of 50 of the original 99 features are identified by taking the top 15 feature loadings within each of the first 22 components, which were determined above to be the appropriate number of components to maximize explained variance. We may now optionally use these 50 identified features directly, or use the principal component vectors themselves for analysis in the next steps.
Due to the disproportionate number of observations in each separation type in our dataset, we need to create weightings. Using SciKit's class_weight algorithm, we compute an array of weights to be used downstream in our models.
We have chosen to utilize Stratified KFold Cross Validation for our classification analysis, with 5 folds. This means that from our original sample size of 16,638, each "fold" will hold out approximately 20% as test observations and use the rest as training observations, all while keeping the ratio of separation classes equal across folds. This process repeats through 5 iterations, or folds, allowing us to cross-validate our results among different test/train combinations. We have utilized a random_state seed equal to the length of the original sampled dataset to ensure reproducible results.
# 5-fold stratified cross-validation, seeded with the dataset length so the
# folds are reproducible across runs.
# NOTE(review): recent scikit-learn raises if random_state is set without
# shuffle=True; this call relies on the older API used throughout this file
# (see the sklearn.cross_validation imports) -- confirm before upgrading.
seed = len(OPMAnalysisDataNoFamBinary)
cv = StratifiedKFold(n_splits = 5, random_state = seed)
print(OPMAnalysisDataNoFamBinary.shape)
print(cv)
Max Depth The maximum depth (levels) in the tree. When a value is set, the tree may not split further once this level has been met regardless of how many nodes are in the leaf.
Max Features Number of features to consider when looking for a split.
Minimum Samples in Leaf Minimum number of samples required to be in a leaf node. Splits may not occur which cause the number of samples in a leaf to be less than this value. Too low a value here leads to overfitting the tree to train data.
Minimum Samples to Split Minimum number fo samples required to split a node. Care was taken during parameter tests to keep the ratio between Min Samples in Leaf and Min Samples to Split equal to that of the default values (1:2). This was done to allow an even 50/50 split on nodes which match the lowest granularity split criteria. similar to the min samples in leaf, too low a value here leads to overfitting the tree to train data.
n_estimators Number of Trees generated in the forest. Increasing the number of trees, in our models increased accuracy while decreasing performance. We tuned to provide output that completed all 10 iterations in under 10 minutes.
| max_depth | max_features | min_samples_leaf | min_samples_split | n_estimators |
|---|---|---|---|---|
| TBD | TBD | TBD | TBD | TBD |
# NOTE(review): removed three disabled exploratory cells that were kept as
# triple-quoted string literals: a multiclass `rfc_explor` helper and two
# byte-identical copies of `rfc_explor_w_PCA`. Commented-out code should not
# ship in the notebook — it survives in version control, and the live binary
# variants below (rfc_explorBinary / rfc_explorBinary_w_PCA) supersede it.
We have created a function to be re-used for our cross-validation Accuracy Scores. Inputs of PCA components, Model CLF object, original sample data, and a CV containing our test/train splits allow us to easily produce an array of Accuracy Scores for the different permutations of models tested. A XXXXXXTBDXXXXX plot is also displayed depicting a view of the misclassification values for each iteration. Finally, a confusion matrix is displayed for the last test/train iteration for further interpretation on results.
# NOTE(review): removed two disabled cells kept as string literals: the
# multiclass random-forest grid-search driver (All Raw Features / Top-15 /
# PCA runs) and a multiclass `plot_confusion_matrix` helper. The binary
# grid-search cells and `plot_confusion_matrixBinary` below are the live
# replacements; the removed text is preserved in version control history.
# NOTE(review): removed disabled string-literal cells containing the
# multiclass `compute_kfold_scores_Classification` function, its driver
# call, and a `SEP.unique()` probe. Superseded by the live
# `compute_kfold_scores_ClassificationBinary` defined below.
# NOTE(review): removed disabled string-literal cells holding an abandoned
# multiclass ROC attempt: a second draft of
# compute_kfold_scores_Classification, a OneVsRest random-forest driver,
# and a half-commented per-class ROC/probability experiment. Dead code —
# kept only in version control history.
As was done before, we assess weights across classes. Since stratification was performed previously, we have equal weights. Thus, we can ignore weighting in our binary classifications.
# Balanced class weights over the binary SEP target. Per the narrative above,
# stratified sampling equalized the classes, so these weights come out equal
# and weighting can be ignored in the binary models that follow.
OPMClassWeights = class_weight.compute_class_weight("balanced", OPMAnalysisDataNoFamBinary["SEP"].drop_duplicates(), OPMAnalysisDataNoFamBinary["SEP"])
# Join the computed weights onto the stratum profile table for display.
display(stratumProf.merge(pd.DataFrame({"Weight": OPMClassWeights, "SEP": OPMAnalysisDataNoFamBinary["SEP"].drop_duplicates()}),on="SEP", how="inner"))
We have chosen to utilize Stratified KFold Cross Validation for our classification analysis, with 5 folds. This means, that from our original sample size of 8002, each "fold" will save off approximately 20% as test observations utilizing the rest as training observations all while keeping the ratio of classes equal amongst customers and subscribers. This process will occur through 5 iterations, or folds, to allow us to cross validate our results amongst different test/train combinations. We have utilized a random_state seed equal to the length of the original sampled dataset to ensure reproducible results.
# Re-seed and rebuild the 5-fold stratified CV object for the binary analysis.
seed = len(OPMAnalysisDataNoFamBinary)
# NOTE(review): random_state has no effect on StratifiedKFold without
# shuffle=True — splits are deterministic either way. Confirm intent.
cv = StratifiedKFold(n_splits = 5, random_state = seed)
print(OPMAnalysisDataNoFamBinary.shape)
print(cv)
Max Depth The maximum depth (levels) in the tree. When a value is set, the tree may not split further once this level has been met regardless of how many nodes are in the leaf.
Max Features Number of features to consider when looking for a split.
Minimum Samples in Leaf Minimum number of samples required to be in a leaf node. Splits may not occur which cause the number of samples in a leaf to be less than this value. Too low a value here leads to overfitting the tree to train data.
Minimum Samples to Split Minimum number of samples required to split a node. Care was taken during parameter tests to keep the ratio between Min Samples in Leaf and Min Samples to Split equal to that of the default values (1:2). This was done to allow an even 50/50 split on nodes which match the lowest-granularity split criteria. Similar to Min Samples in Leaf, too low a value here leads to overfitting the tree to the training data.
n_estimators Number of Trees generated in the forest. Increasing the number of trees, in our models increased accuracy while decreasing performance. We tuned to provide output that completed all 10 iterations in under 10 minutes.
| max_depth | max_features | min_samples_leaf | min_samples_split | n_estimators |
|---|---|---|---|---|
| TBD | TBD | TBD | TBD | TBD |
%%time
def rfc_explorBinary(n_estimators,
                     max_features,
                     max_depth,
                     min_samples_split,
                     min_samples_leaf,
                     Data = OPMAnalysisDataNoFamBinary,
                     cols = PCList,
                     cv = cv,
                     seed = seed):
    """Evaluate one random-forest parameter set on the binary SEP target.

    Runs stratified K-fold cross validation on a MinMaxScaler -> forest
    pipeline and returns a numpy array of the per-fold accuracies followed
    by their mean and the wall-clock run time of the whole evaluation.
    """
    started = datetime.now()
    labels = Data["SEP"].values
    features = Data[cols].as_matrix()
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_features=max_features,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    n_jobs=-1,
                                    random_state=seed)
    # Scale features into a common [0, 1] range before classifying.
    pipeline = Pipeline([('minMaxScaler', MinMaxScaler()),
                         ('CLF', forest)])
    scores = cross_val_score(pipeline, features, labels, cv=cv.split(features, labels))
    scores = np.append(scores, sum(scores) / len(scores))
    elapsed = datetime.now() - started
    return np.append(scores, elapsed)
%%time
def rfc_explorBinary_w_PCA(n_estimators,
                           max_features,
                           max_depth,
                           min_samples_split,
                           min_samples_leaf,
                           PCA,
                           Data = OPMAnalysisDataNoFamBinary,
                           cv = cv,
                           seed = seed):
    """Like rfc_explorBinary, but inserts the supplied PCA transform
    between scaling and classification and uses every non-target column.

    Returns per-fold accuracies plus their mean and the total run time.
    """
    started = datetime.now()
    labels = Data["SEP"].values
    features = Data.drop("SEP", axis=1).as_matrix()
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_features=max_features,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    n_jobs=-1,
                                    random_state=seed)
    # Scale, project onto principal components, then classify.
    pipeline = Pipeline([('minMaxScaler', MinMaxScaler()),
                         ('PCA', PCA),
                         ('CLF', forest)])
    scores = cross_val_score(pipeline, features, labels, cv=cv.split(features, labels))
    scores = np.append(scores, sum(scores) / len(scores))
    elapsed = datetime.now() - started
    return np.append(scores, elapsed)
%%time
acclist = []
# Feature list: every column except the target.
# FIX(review): the original removed "SEP" while iterating over the same
# list, which silently skips the element following each removal; a
# comprehension has no such hazard.
fullColumns = [c for c in OPMAnalysisDataNoFamBinary.columns if c != "SEP"]
# Hyper-parameter grid: one model per column position.
n_estimators      = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 20, 30, 50]
max_features      = ['auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 5, 10, 15, 20, None, 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto']
max_depth         = [None, None, None, None, None, None, None, None, None, None, None, None, 10, 15, 20, 25, 30, 17, 18, 19, 21, 22, 23, 20, 20, 20, 20]
min_samples_split = [2, 8, 12, 18, 20, 24, 36, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]
min_samples_leaf  = [1, 4, 6, 9, 10, 12, 18, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
## Model with all Raw Scaled Features
for i in range(len(n_estimators)):
    acclist.append(rfc_explorBinary(n_estimators=n_estimators[i],
                                    max_features=max_features[i],
                                    max_depth=max_depth[i],
                                    min_samples_split=min_samples_split[i],
                                    min_samples_leaf=min_samples_leaf[i],
                                    cols=fullColumns))
# Summary table: hyper-parameters alongside per-fold accuracies, mean, time.
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({"ModelVersion": "All Raw Features",
                                              "n_estimators": n_estimators,
                                              "max_features": max_features,
                                              "max_depth": max_depth,
                                              "min_samples_split": min_samples_split,
                                              "min_samples_leaf": min_samples_leaf}),
                                pd.DataFrame(acclist)], axis=1).reindex())
# NOTE(review): the rename below relies on the dict-built frame ordering its
# columns alphabetically (legacy pandas behaviour) — confirm on upgrade.
rfcdf.columns = ['ModelVersion', 'max_depth', 'max_features', 'min_samples_leaf',
                 'min_samples_split', 'n_estimators', 'Iteration 0', 'Iteration 1',
                 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
del rfcdf, acclist
# Same hyper-parameter grid, but the models use the default feature set
# (cols = PCList): the top 15 raw features selected from the principal
# components.
acclist = []
n_estimators = [10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 15 , 20 , 30 , 50 ]
max_features = ['auto', 'auto' , 'auto', 'auto', 'auto', 'auto', 'auto', 5 , 10 , 15 , 20 , None , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ]
max_depth = [None , None , None , None , None , None , None , None , None , None , None , None , 10 , 15 , 20 , 25 , 30 , 17 , 18 , 19 , 21 , 22 , 23 , 20 , 20 , 20 , 20 ]
min_samples_split = [2 , 8 , 12 , 18 , 20 , 24 , 36 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 , 18 ]
min_samples_leaf = [1 , 4 , 6 , 9 , 10 , 12 , 18 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ]
## Model with only top 15 raw Scaled Principal Features
for i in range(0,len(n_estimators)):
    acclist.append(rfc_explorBinary(n_estimators = n_estimators[i],
                                    max_features = max_features[i],
                                    max_depth = max_depth[i],
                                    min_samples_split = min_samples_split[i],
                                    min_samples_leaf = min_samples_leaf[i]
                                    )
                   )
# Summary table: hyper-parameters plus per-fold accuracies, mean, run time.
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({ "ModelVersion": "Top 15 Raw from PC",
                                               "n_estimators": n_estimators,
                                               "max_features": max_features,
                                               "max_depth": max_depth,
                                               "min_samples_split": min_samples_split,
                                               "min_samples_leaf": min_samples_leaf
                                               }),
                                pd.DataFrame(acclist)], axis = 1).reindex())
# NOTE(review): the rename relies on the dict-built frame ordering its
# columns alphabetically (legacy pandas behaviour) — confirm on upgrade.
rfcdf.columns = ['ModelVersion', 'max_depth', 'max_features', 'min_samples_leaf','min_samples_split', 'n_estimators', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
del rfcdf, acclist
### Model with PCA
# Third variant: all non-target columns, scaled then projected onto 22
# randomized-SVD principal components before the forest. Note the lower
# min_samples_split/min_samples_leaf (8/4) versus the other two grids.
acclist = []
n_estimators = [10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 15 , 20 , 30 , 50 ]
max_features = ['auto', 'auto' , 'auto', 'auto', 'auto', 'auto', 'auto', 5 , 10 , 15 , 20 , None , 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto']
max_depth = [None , None , None , None , None , None , None , None , None , None , None , None , 10 , 15 , 20 , 25 , 30 , 17 , 18 , 19 , 21 , 22 , 23 , 20 , 20 , 20 , 20 ]
min_samples_split = [2 , 8 , 12 , 18 , 20 , 24 , 36 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 ]
min_samples_leaf = [1 , 4 , 6 , 9 , 10 , 12 , 18 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ]
for i in range(0,len(n_estimators)):
    acclist.append(rfc_explorBinary_w_PCA(n_estimators = n_estimators[i],
                                          max_features = max_features[i],
                                          max_depth = max_depth[i],
                                          min_samples_split = min_samples_split[i],
                                          min_samples_leaf = min_samples_leaf[i],
                                          PCA = PCA(n_components=22, svd_solver='randomized', random_state = seed)
                                          )
                   )
# Summary table: hyper-parameters plus per-fold accuracies, mean, run time.
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({ "ModelVersion": "With PCA",
                                               "n_estimators": n_estimators,
                                               "max_features": max_features,
                                               "max_depth": max_depth,
                                               "min_samples_split": min_samples_split,
                                               "min_samples_leaf": min_samples_leaf
                                               }),
                                pd.DataFrame(acclist)], axis = 1).reindex())
# NOTE(review): the rename relies on the dict-built frame ordering its
# columns alphabetically (legacy pandas behaviour) — confirm on upgrade.
rfcdf.columns = ['ModelVersion', 'max_depth', 'max_features', 'min_samples_leaf','min_samples_split', 'n_estimators', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
#'Iteration 5', 'Iteration 6', 'Iteration 7', 'Iteration 8', 'Iteration 9',
We have created a function to be re-used for our cross-validation Accuracy Scores. Inputs of PCA components, Model CLF object, original sample data, and a CV containing our test/train splits allow us to easily produce an array of Accuracy Scores for the different permutations of models tested. A XXXXXXTBDXXXXX plot is also displayed depicting a view of the misclassification values for each iteration. Finally, a confusion matrix is displayed for the last test/train iteration for further interpretation on results.
def plot_confusion_matrixBinary(cm, classes,
                                normalize=False,
                                title='Confusion matrix',
                                cmap=plt.cm.Blues):
    """Print and plot the confusion matrix `cm` with class tick labels.

    Normalization (row-wise, i.e. per true class) can be applied by
    setting `normalize=True`.

    FIX(review): the original normalized `cm` *after* drawing it with
    plt.imshow, so the heat-map colours reflected raw counts while the
    printout and cell annotations showed normalized rates. Normalization
    now happens first so colours, printout, and annotations agree.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.rcParams['figure.figsize'] = (18, 6)
    plt.rcParams.update({'font.size': 16})
    plt.rc('xtick', labelsize=18)
    plt.rc('ytick', labelsize=18)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize = 18)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Annotate every cell; flip the text colour on dark backgrounds.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, round(cm[i, j],2),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label', fontsize = 18)
    plt.xlabel('Predicted label', fontsize = 18)
    plt.show()
def plot_ROC_curve(X, y, mean_tpr, mean_fpr, cv = cv, ):
    """Finalize and show the ROC figure after the per-fold curves have
    already been drawn onto the current axes by the caller.

    NOTE(review): `mean_tpr` arrives as the running *sum* of the
    fold-interpolated TPRs; it is divided by the fold count here — and
    mutated in place, since `/=` on a numpy array modifies the caller's
    array. `cv` defaults to the module-level StratifiedKFold object.
    """
    plt.rcParams['figure.figsize'] = (12, 6)
    lw = 2
    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
             label='Luck')
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0  # force the mean curve to terminate at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
%%time
def compute_kfold_scores_ClassificationBinary( clf,
                                               PCA = "No",
                                               Data = OPMAnalysisDataNoFamBinary,
                                               cols = PCList,
                                               cv = cv):
    """Cross-validate `clf` on the binary SEP target (NS=0, SC=1).

    Draws one ROC curve per fold plus the mean ROC, collects per-fold
    accuracies, and shows a confusion matrix for the final fold only.
    Returns (fitted CLF step from the last fold, list of fold accuracies).
    """
    y = Data["SEP"].values  # target labels
    y = np.where(y == 'NS', 0, 1)  # NS = 0; SC = 1
    X = Data[cols].as_matrix()
    # Build the pipeline: scale, optionally project with the supplied PCA
    # object (any value other than the string "No"), then classify.
    if(PCA == "No"):
        clf_pipe = Pipeline(
            [('minMaxScaler', MinMaxScaler()),
             ('CLF',clf)]
        )
    else:
        clf_pipe = Pipeline(
            [('minMaxScaler', MinMaxScaler()),
             ('PCA', PCA),
             ('CLF',clf)]
        )
    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange', 'pink', 'darkred', 'dimgray', 'maroon', 'coral'])
    mean_tpr = 0.0                       # running sum of interpolated TPRs
    mean_fpr = np.linspace(0, 1, 100)    # common FPR grid for interpolation
    lw = 2
    i = 0
    accuracy = []
    # Pass 1: one ROC curve per fold, accumulating the mean TPR.
    for (train, test), color in zip(cv.split(X, y), colors):
        clf_pipe.fit(X[train],y[train]) # train object
        y_hat = clf_pipe.predict(X[test]) # get test set preditions
        # NOTE(review): this refits the identical pipeline on the same fold;
        # predict_proba could reuse the fit above — redundant work.
        probas_ = clf_pipe.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve for this fold.
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.rcParams['figure.figsize'] = (12, 6)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        i += 1
    # Overlay the mean ROC (plot_ROC_curve divides mean_tpr by n_splits).
    plot_ROC_curve(X, y, mean_tpr, mean_fpr)
    # Pass 2: refit per fold to collect accuracies.
    # NOTE(review): this repeats the same CV fits yet again; the scores
    # could have been captured in pass 1.
    for (train, test), color in zip(cv.split(X, y), colors):
        clf_pipe.fit(X[train],y[train]) # train object
        y_hat = clf_pipe.predict(X[test]) # get test set preditions
        a = float(mt.accuracy_score(y[test],y_hat))
        accuracy.append(round(a,5))
    # Confusion matrix for the final fold only (y[test]/y_hat are the
    # leftovers of the last loop iteration).
    ytestnames = np.where(y[test] == 0,'NS','SC')
    yhatnames = np.where(y_hat == 0,'NS', 'SC')
    print("confusion matrix\n{0}\n".format(pd.crosstab(ytestnames, yhatnames, rownames = ['True'], colnames = ['Predicted'], margins = True)))
    # Plot normalized confusion matrix for the final fold.
    plt.figure()
    plot_confusion_matrixBinary(confusion_matrix(y[test], y_hat),
                                classes =["NS", "SC"],
                                normalize =True,
                                title ='Confusion matrix, with normalization')
    print("Accuracy Ratings across all iterations: {0}\n\n\
Average Accuracy: {1}\n".format(accuracy, round(sum(accuracy)/len(accuracy),5)))
    return clf_pipe.named_steps['CLF'], accuracy
%%time
# Final binary random-forest model using the winning grid parameters.
rfc_clf = RandomForestClassifier(n_estimators = 15,
                                 max_features = 'auto',
                                 max_depth = 20.0,  # NOTE(review): float literal; the tuning table lists the int 20
                                 min_samples_split = 18,
                                 min_samples_leaf = 9,
                                 n_jobs = -1,
                                 random_state = seed) # get object
# Cross-validate on all non-target columns; rebinds rfc_clf to the fitted
# CLF step from the final fold so the trees below can be exported.
rfc_clf, rfc_acc = compute_kfold_scores_ClassificationBinary(rfc_clf,
                                                             ##PCA = PCA(n_components=22, svd_solver='randomized', random_state = seed),
                                                             cols = fullColumns)
%%time
import os
from sklearn import tree
import pydotplus
import six
from sklearn.tree import export_graphviz
from IPython.display import SVG
# Export every tree in the fitted forest as an SVG under ./images, then
# display the first one inline.
# (hoisted out of the loop — the check is loop-invariant)
if not os.path.exists('images'):
    os.makedirs('images')
i_tree = 0
for tree_in_forest in rfc_clf.estimators_:
    # FIX(review): the binary target is NS vs SC (see the label mapping in
    # compute_kfold_scores_ClassificationBinary), not NS vs SD.
    svgData = tree.export_graphviz(tree_in_forest,
                                   feature_names=fullColumns,
                                   class_names=["NS", "SC"],
                                   filled=True,
                                   #rounded=True,
                                   rotate = True,
                                   label = 'All',
                                   out_file=None)
    graph = pydotplus.graph_from_dot_data(svgData)
    graph.write_svg('images/tree' + str(i_tree) + '.svg')
    i_tree = i_tree + 1
SVG(filename='images/tree0.svg')
Algorithm
Options include "Ball Tree" and "KD Tree".
Our findings were that the Ball Tree algorithm was considerably less efficient at producing results for all 10 iterations than the KD Tree algorithm.
Leaf Size
The size for leaf nodes in the KNN Tree.
Number of Neighbors
After 24 iterations of modifying the above parameters, we land on a final winner based on the highest average accuracy across all iterations. Average accuracy values in our 10 test/train iterations ranged from 66.5216% for the worst parameter inputs (the Ball Tree algorithm) to 69.5528% for the best-tuned KNN classification model fit. We have chosen to use the best input for the KD Tree algorithm, giving up an improvement of 0.0004% because of the cost (a slower runtime of 7 minutes 25 seconds across 10 iterations) of fitting the model with the Ball Tree algorithm. Parameter inputs for the final K Nearest Neighbors classification model with the KD Tree algorithm are as follows:
| algorithm | leaf_size | n_neighbors |
|---|---|---|
| kd_tree | 50 | 150 |
%%time
def knn_explorBinary_w_PCA(n_neighbors,
                           algorithm,
                           leaf_size,
                           PCA,
                           Data = OPMAnalysisDataNoFamBinary,
                           cv = cv,
                           seed = seed):
    """Cross-validate and time a KNN classifier over PCA-reduced features.

    Builds a MinMaxScaler -> PCA -> KNeighborsClassifier pipeline and scores
    it with cross_val_score using the supplied CV splitter.

    Parameters: n_neighbors / algorithm / leaf_size go straight to
    KNeighborsClassifier; PCA is a pre-configured sklearn PCA transformer;
    Data must contain the binary "SEP" label column; seed is kept only for
    signature compatibility with the sibling explorer functions.

    Returns a numpy array of per-fold accuracies followed by their mean and
    the total wall-clock run time (the array becomes object-dtype once the
    timedelta is appended).
    """
    startTime = datetime.now()
    y = Data["SEP"].values  # binary separation labels
    # Fix: DataFrame.as_matrix() was deprecated (pandas 0.23) and removed
    # (pandas 1.0); .values returns the same ndarray.
    X = Data.drop("SEP", axis=1).values
    knn_clf = KNeighborsClassifier(n_neighbors = n_neighbors, algorithm = algorithm, leaf_size = leaf_size, n_jobs=-1) # get object
    # Scale first so PCA components and KNN distances are not dominated by
    # wide-range columns.
    clf_pipe = Pipeline(
        [('minMaxScaler', MinMaxScaler()),
         ('PCA', PCA),
         ('CLF', knn_clf)]
    )
    accuracy = cross_val_score(clf_pipe, X, y, cv=cv.split(X, y)) # this also can help with parallelism
    MeanAccuracy = sum(accuracy)/len(accuracy)
    accuracy = np.append(accuracy, MeanAccuracy)
    endTime = datetime.now()
    TotalTime = endTime - startTime
    accuracy = np.append(accuracy, TotalTime)
    return accuracy
%%time
def knn_explorBinary(n_neighbors,
                     algorithm,
                     leaf_size,
                     Data = OPMAnalysisDataNoFamBinary,
                     cols = PCList,
                     cv = cv,
                     seed = seed):
    """Cross-validate and time a KNN classifier on a chosen column subset.

    Parameters: n_neighbors / algorithm / leaf_size go straight to
    KNeighborsClassifier; Data must contain the binary "SEP" label; cols is
    the feature column list ("SEP" is dropped from it if present); seed is
    kept only for signature compatibility (KNN itself is deterministic).

    Returns a numpy array of per-fold accuracies followed by their mean and
    the total wall-clock run time (the array becomes object-dtype once the
    timedelta is appended).
    """
    startTime = datetime.now()
    y = Data["SEP"].values  # binary separation labels
    # Fix: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # supported equivalent. Both branches now produce an ndarray so the
    # feature matrix type no longer depends on whether "SEP" is in cols.
    if ("SEP" in cols):
        X = Data[cols].drop("SEP", axis=1).values
    else:
        X = Data[cols].values
    knn_clf = KNeighborsClassifier(n_neighbors = n_neighbors, algorithm = algorithm, leaf_size = leaf_size, n_jobs=-1) # get object
    # Min-max scale so no single wide-range feature dominates the distances.
    clf_pipe = Pipeline(
        [('minMaxScaler', MinMaxScaler()),
         ('CLF', knn_clf)]
    )
    accuracy = cross_val_score(clf_pipe, X, y, cv=cv.split(X, y)) # this also can help with parallelism
    MeanAccuracy = sum(accuracy)/len(accuracy)
    accuracy = np.append(accuracy, MeanAccuracy)
    endTime = datetime.now()
    TotalTime = endTime - startTime
    accuracy = np.append(accuracy, TotalTime)
    return accuracy
%%time
###Full Columns
acclist = []
n_neighbors = [5 , 10 , 15 , 20 , 30 , 40 , 50 , 100 , 150 , 200 , 250 , 200 , 200 , 200 , 200 , 200 , 200 ]
algorithm = 'kd_tree'
leaf_size = [30 , 30 , 30 , 30 , 30 , 30 , 30 , 30 , 30 , 30 , 30 , 5 , 10 , 20 , 50 , 100 , 300 ]
for i in range(0,len(n_neighbors)):
acclist.append(knn_explorBinary(n_neighbors = n_neighbors[i],
algorithm = algorithm,
leaf_size = leaf_size[i],
cols = fullColumns
)
)
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({
"ModelVersion": "Full Raw Columns",
"n_neighbors": n_neighbors,
"algorithm": algorithm,
"leaf_size": leaf_size
}),
pd.DataFrame(acclist)], axis = 1).reindex())
rfcdf.columns = ['ModelVersion','algorithm', 'leaf_size','n_neighbors', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
# Same sweep as above on the full raw feature set, but with ball_tree.
acclist = []
n_neighbors = [5, 10, 15, 20, 30, 40, 50, 100, 150, 200, 250, 200, 200, 200, 200, 200, 200]
algorithm = 'ball_tree'
leaf_size = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 5, 10, 20, 50, 100, 300]
for nn, ls in zip(n_neighbors, leaf_size):
    acclist.append(knn_explorBinary(n_neighbors = nn,
                                    algorithm = algorithm,
                                    leaf_size = ls,
                                    cols = fullColumns))
# Fix: explicit column order — positional rename below assumes
# algorithm/leaf_size/n_neighbors, which dict ordering does not guarantee.
params = pd.DataFrame({
    "ModelVersion": "Full Raw Columns",
    "n_neighbors": n_neighbors,
    "algorithm": algorithm,
    "leaf_size": leaf_size
})[["ModelVersion", "algorithm", "leaf_size", "n_neighbors"]]
rfcdf = pd.concat([params, pd.DataFrame(acclist)], axis = 1)
rfcdf.columns = ['ModelVersion','algorithm', 'leaf_size','n_neighbors', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
###Reduced Columns
# kd_tree sweep on the reduced (PCList default) column set.
acclist = []
n_neighbors = [5, 10, 15, 20, 30, 40, 50, 100, 150, 200, 250, 300, 350, 400, 150, 150, 150, 150, 150, 150]
algorithm = 'kd_tree'
leaf_size = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 5, 10, 20, 50, 100, 150]
for nn, ls in zip(n_neighbors, leaf_size):
    acclist.append(knn_explorBinary(n_neighbors = nn,
                                    algorithm = algorithm,
                                    leaf_size = ls))
# Fix: explicit column order to match the positional rename below (dict
# ordering is version-dependent in pandas).
params = pd.DataFrame({
    "ModelVersion": "Reduced Raw Columns",
    "n_neighbors": n_neighbors,
    "algorithm": algorithm,
    "leaf_size": leaf_size
})[["ModelVersion", "algorithm", "leaf_size", "n_neighbors"]]
rfcdf = pd.concat([params, pd.DataFrame(acclist)], axis = 1)
rfcdf.columns = ['ModelVersion','algorithm', 'leaf_size','n_neighbors', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
# ball_tree sweep on the reduced (PCList default) column set.
acclist = []
n_neighbors = [5, 10, 15, 20, 30, 40, 50, 100, 150, 200, 250, 300, 350, 400, 150, 150, 150, 150, 150, 150]
algorithm = 'ball_tree'
leaf_size = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 5, 10, 20, 50, 100, 150]
for nn, ls in zip(n_neighbors, leaf_size):
    acclist.append(knn_explorBinary(n_neighbors = nn,
                                    algorithm = algorithm,
                                    leaf_size = ls))
# Fix: explicit column order to match the positional rename below.
params = pd.DataFrame({
    "ModelVersion": "Reduced Raw Columns",
    "n_neighbors": n_neighbors,
    "algorithm": algorithm,
    "leaf_size": leaf_size
})[["ModelVersion", "algorithm", "leaf_size", "n_neighbors"]]
rfcdf = pd.concat([params, pd.DataFrame(acclist)], axis = 1)
rfcdf.columns = ['ModelVersion','algorithm', 'leaf_size','n_neighbors', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
#### WITH PCA
# kd_tree sweep with a fresh 22-component randomized PCA per fit.
acclist = []
n_neighbors = [5, 10, 15, 20, 30, 40, 50, 100, 150, 200, 250, 150, 150, 150, 150, 150, 150, 150]
algorithm = ['kd_tree'] * 18  # one entry per sweep configuration
leaf_size = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 5, 10, 20, 50, 100, 150, 300]
for nn, alg, ls in zip(n_neighbors, algorithm, leaf_size):
    acclist.append(knn_explorBinary_w_PCA(n_neighbors = nn,
                                          algorithm = alg,
                                          leaf_size = ls,
                                          PCA = PCA(n_components=22, svd_solver='randomized', random_state = seed)))
# Fix: explicit column order to match the positional rename below (dict
# ordering is version-dependent in pandas).
params = pd.DataFrame({
    "ModelVersion": "With PCA",
    "n_neighbors": n_neighbors,
    "algorithm": algorithm,
    "leaf_size": leaf_size
})[["ModelVersion", "algorithm", "leaf_size", "n_neighbors"]]
rfcdf = pd.concat([params, pd.DataFrame(acclist)], axis = 1)
rfcdf.columns = ['ModelVersion','algorithm', 'leaf_size','n_neighbors', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
# ball_tree sweep with a fresh 22-component randomized PCA per fit.
acclist = []
n_neighbors = [5, 10, 15, 20, 30, 40, 50, 100, 150, 200, 250, 150, 150, 150, 150, 150, 150, 150]
algorithm = ['ball_tree'] * 18  # one entry per sweep configuration
leaf_size = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 5, 10, 20, 50, 100, 150, 300]
for nn, alg, ls in zip(n_neighbors, algorithm, leaf_size):
    acclist.append(knn_explorBinary_w_PCA(n_neighbors = nn,
                                          algorithm = alg,
                                          leaf_size = ls,
                                          PCA = PCA(n_components=22, svd_solver='randomized', random_state = seed)))
# Fix: explicit column order to match the positional rename below.
params = pd.DataFrame({
    "ModelVersion": "With PCA",
    "n_neighbors": n_neighbors,
    "algorithm": algorithm,
    "leaf_size": leaf_size
})[["ModelVersion", "algorithm", "leaf_size", "n_neighbors"]]
rfcdf = pd.concat([params, pd.DataFrame(acclist)], axis = 1)
rfcdf.columns = ['ModelVersion','algorithm', 'leaf_size','n_neighbors', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
%%time
# Final KNN model, scored with the shared binary k-fold helper.
# NOTE(review): n_neighbors=250 / leaf_size=30 here differs from the
# kd_tree / leaf 50 / 150-neighbor winner quoted in the narrative above —
# confirm which configuration is intended as final.
knn_clf = KNeighborsClassifier(n_neighbors = 250, algorithm = 'kd_tree',leaf_size = 30, n_jobs=-1) # get object
knn_clf, knn_acc = compute_kfold_scores_ClassificationBinary(clf = knn_clf)
We have chosen to manipulate the cost variable (C) within our logistic regression, analyzing accuracies at {1.0, 0.01, 0.05, 5}. This parameter is essentially an inverse regularization strength equal to 1/lambda per the scikit-learn class function code (lambda being the actual regularization term). Therefore, the smaller the cost value, the stronger the regularization.
# Encode the separation label numerically: non-separated -> 0, separated -> 1.
sep_codes = {'NS': 0, 'SC': 1}
y = OPMAnalysisDataNoFamBinary.replace({'SEP': sep_codes})['SEP']
%%R -i OPMAnalysisDataNoFamBinary,fullColumns,y
# Push the analysis frame, feature list, and encoded label vector into R,
# then sanity-check what arrived. `car` provides vif() used later.
install.packages("car")
require(car)
str(OPMAnalysisDataNoFamBinary)
print(unlist(fullColumns))
print(paste("# of SEP observations = ", length(y)))
print(paste("SEP type = ", unique(y)))
print(paste("SEP class = ", class(y)))
LOCTYP_1 and PPTYP_1 have only a single level and need to be removed.
# Count distinct non-NA values per column; single-level factors break glm().
%R sapply(OPMAnalysisDataNoFamBinary, function(x) length(unique(x[!is.na(x)])))
# Re-check after dropping columns 95 and 96 (the single-level LOCTYP_1/PPTYP_1
# flagged above).
%R sapply(OPMAnalysisDataNoFamBinary[,-c(95,96)], function(x) length(unique(x[!is.na(x)])))
%R OPMAnalysisDataNoFamBinary[,-c(95,96)]
Perform logistic regression with the intent of extracting only the most important features.
%%R
# Build the formula SEP ~ <all features except the single-level columns> and
# fit the full binomial GLM.
vars <- unlist(fullColumns)
# Drop positions 94 and 95 from the feature vector.
# NOTE(review): the data-frame drop above used columns 95/96 — confirm the
# off-by-one between the frame and the feature vector is intentional (e.g.
# because SEP is absent from fullColumns).
vars <- vars[-c(94,95)]
#print(vars)
fla <- paste("SEP ~", paste(vars, collapse="+"))
fla <- as.formula(fla)
OPMAnalysisDataNoFamBinary$SEP <- as.vector(y)
BinLogit <- glm(fla, data = OPMAnalysisDataNoFamBinary, family = "binomial")
summary(BinLogit)
%%R
# Inspect linearly dependent (aliased) terms in the fitted model.
alias(BinLogit)
%%R
# Capture the row names of the perfectly collinear (aliased) coefficients.
tmp <- alias(BinLogit)$Complete
print(attributes(tmp))
aliased <- dimnames(tmp)[[1]]
%%R
print("Following attributes will be dropped from model due to multicollinearity:")
# Coefficient names carry a trailing factor-level digit; strip it to recover
# the underlying variable name so it can be matched against `vars`.
aliased <- ifelse(grepl('[[:digit:]]$', aliased), substr(aliased, 1, nchar(aliased)-1), aliased)
print(aliased)
paste(as.character(length(vars) - length(vars[!vars %in% c(aliased)])), "attributes removed from model input")
%%R
# Refit the binomial GLM on the current global `vars` list.
# NOTE(review): the `less` argument is unused — filtering is performed on the
# global `vars` before each call (see the line preceding each invocation).
runLogit <- function(less){
#vars <- vars[!(vars %in% c(less))]
fla <- paste("SEP ~", paste(vars, collapse="+"))
fla <- as.formula(fla)
binLog <- glm(fla, data = OPMAnalysisDataNoFamBinary, family = "binomial")
return(binLog)
}
# Drop the aliased attributes, then refit without them.
vars <- vars[!(vars %in% c(aliased))]
BinLogit2 <- runLogit(aliased)
summary(BinLogit2)
%%R
# Report the least-significant remaining coefficient (largest p-value),
# ignoring the intercept.
#data.frame(summary(BinLogit)$coef[summary(BinLogit)$coef[,4] <= .05, 4]) #Review coefficients of p-value less than 0.05
LogitCoeffs <- data.frame(summary(BinLogit2)$coef[-1,4]) #Ignore Intercept and only look at p-values
#LogitCoeffs[LogitCoeffs$`summary.BinLogit..coef..1..4.` == max(LogitCoeffs$`summary.BinLogit..coef..1..4.`),]
cbind(rownames(LogitCoeffs)[LogitCoeffs$`summary.BinLogit2..coef..1..4.` == max(LogitCoeffs$`summary.BinLogit2..coef..1..4.`)],
max(LogitCoeffs))
%%R
# Compute and print variance inflation factors for a fitted logit, returning
# the VIF table for later filtering.
runVifs <- function(logit){
tmp <- as.data.frame(vif(logit))
colnames(tmp) <- "VIF"
# Temporarily force fixed-point notation so large VIFs print readably.
scipen.default <- getOption("scipen")
options(scipen=999)
print(tmp)
options(scipen=scipen.default)
return(tmp)
}
vifs.BinLogit2 <- runVifs(BinLogit2)
%%R
# Flag attributes with extreme multicollinearity (VIF > 1000) for removal.
remove <- rownames(vifs.BinLogit2)[vifs.BinLogit2$VIF > 1000]
print(paste(length(rownames(vifs.BinLogit2)) - sum(!(rownames(vifs.BinLogit2) %in% remove)), "more attributes removed from model input"))
%%R
# Show which high-VIF attributes are being removed.
print(remove)
%%R
# Refit after dropping the high-VIF attributes.
vars <- vars[!(vars %in% c(remove))]
BinLogit3 <- runLogit(remove)
summary(BinLogit3)
%%R
# Re-check VIFs on the reduced model.
vifs.BinLogit3 <- runVifs(BinLogit3)
%%time
def lr_explorBinary(cost,
                    Data = OPMAnalysisDataNoFamBinary,
                    cols = PCList,
                    cv = cv,
                    seed = seed):
    """Cross-validate and time an L2 logistic regression on chosen columns.

    Parameters: cost is sklearn's C (inverse regularization strength); Data
    must contain the binary "SEP" label; cols is the feature column list
    ("SEP" is dropped from it if present); seed fixes the solver's RNG.

    Returns a numpy array of per-fold accuracies followed by their mean and
    the total wall-clock run time (the array becomes object-dtype once the
    timedelta is appended).
    """
    startTime = datetime.now()
    y = Data["SEP"].values  # binary separation labels
    # Fix: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # supported equivalent. Both branches now yield an ndarray.
    if ("SEP" in cols):
        X = Data[cols].drop("SEP", axis=1).values
    else:
        X = Data[cols].values
    lr_clf = LogisticRegression(penalty='l2', C=cost, class_weight=None, random_state=seed) # get object
    # Min-max scale so regularization penalizes comparable coefficient scales.
    clf_pipe = Pipeline(
        [('minMaxScaler', MinMaxScaler()),
         ('CLF', lr_clf)]
    )
    accuracy = cross_val_score(clf_pipe, X, y, cv=cv.split(X, y)) # this also can help with parallelism
    MeanAccuracy = sum(accuracy)/len(accuracy)
    accuracy = np.append(accuracy, MeanAccuracy)
    endTime = datetime.now()
    TotalTime = endTime - startTime
    accuracy = np.append(accuracy, TotalTime)
    return accuracy
%%time
def lr_explorBinary_w_PCA(cost,
                          PCA,
                          Data = OPMAnalysisDataNoFamBinary,
                          cv = cv,
                          seed = seed):
    """Cross-validate and time an L2 logistic regression on PCA components.

    Parameters: cost is sklearn's C (inverse regularization strength); PCA is
    a pre-configured sklearn PCA transformer; Data must contain the binary
    "SEP" label; seed fixes the solver's RNG.

    Returns a numpy array of per-fold accuracies followed by their mean and
    the total wall-clock run time (the array becomes object-dtype once the
    timedelta is appended).
    """
    startTime = datetime.now()
    y = Data["SEP"].values  # binary separation labels
    # Fix: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # supported equivalent.
    X = Data.drop("SEP", axis=1).values
    lr_clf = LogisticRegression(penalty='l2', C=cost, class_weight=None, random_state=seed) # get object
    # Scale -> PCA -> classifier, so components are derived from comparable scales.
    clf_pipe = Pipeline(
        [('minMaxScaler', MinMaxScaler()),
         ('PCA', PCA),
         ('CLF', lr_clf)]
    )
    accuracy = cross_val_score(clf_pipe, X, y, cv=cv.split(X, y)) # this also can help with parallelism
    MeanAccuracy = sum(accuracy)/len(accuracy)
    accuracy = np.append(accuracy, MeanAccuracy)
    endTime = datetime.now()
    TotalTime = endTime - startTime
    accuracy = np.append(accuracy, TotalTime)
    return accuracy
%%time
##Full Columns
acclist = []
cost = [.00000001, .0001, .001, .01, .05, 1.0, 2.0, 3.0, 4.0, 5.0]
for i in range(0,len(cost)):
acclist.append(lr_explorBinary(cost = cost[i],
cols = fullColumns))
LRdf = pd.DataFrame(pd.concat([pd.DataFrame({
"ModelVersion": "All Raw Features",
"Cost": cost
})[["ModelVersion", "Cost"]],
pd.DataFrame(acclist)], axis = 1).reindex())
LRdf.columns = ['ModelVersion','Cost', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(LRdf)
##Reduced Columns
# Same cost sweep on the default reduced (top-PCA) column list.
cost = [.00000001, .0001, .001, .01, .05, 1.0, 2.0, 3.0, 4.0, 5.0]
acclist = [lr_explorBinary(cost = c) for c in cost]
settings = pd.DataFrame({
    "ModelVersion": "Top 15 from PCA Raw Features",
    "Cost": cost
})[["ModelVersion", "Cost"]]
LRdf = pd.concat([settings, pd.DataFrame(acclist)], axis = 1)
LRdf.columns = ['ModelVersion','Cost', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(LRdf)
##With PCA
# Same cost sweep with a fresh 23-component randomized PCA per fit.
cost = [.00000001, .0001, .001, .01, .05, 1.0, 2.0, 3.0, 4.0, 5.0]
acclist = [lr_explorBinary_w_PCA(cost = c,
                                 PCA = PCA(n_components=23, svd_solver='randomized', random_state = seed))
           for c in cost]
settings = pd.DataFrame({
    "ModelVersion": "With PCA",
    "Cost": cost
})[["ModelVersion", "Cost"]]
LRdf = pd.concat([settings, pd.DataFrame(acclist)], axis = 1)
LRdf.columns = ['ModelVersion','Cost', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(LRdf)
"""
##Significant Column List from Manual Tuning in R
acclist = []
cost = [.00000001, .0001, .001, .01, .05, 1.0, 2.0, 3.0, 4.0, 5.0]
for i in range(0,len(cost)):
acclist.append(lr_explorBinary(cost = cost[i],
cols = LRSigCols))
LRdf = pd.DataFrame(pd.concat([pd.DataFrame({
"ModelVersion": "Manual Significant Features",
"Cost": cost
})[["ModelVersion", "Cost"]],
pd.DataFrame(acclist)], axis = 1).reindex())
LRdf.columns = ['ModelVersion','Cost', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(LRdf)
"""
# Final logistic regression with the chosen cost (C=.0001) on all raw
# columns, scored via the shared binary k-fold helper.
lr_clf = LogisticRegression(penalty='l2', C=.0001, class_weight=None, random_state=seed) # get object
lr_clf, lr_acc = compute_kfold_scores_ClassificationBinary(clf = lr_clf,
cols = fullColumns)
print(lr_clf.coef_[0])
# Plot the 20 largest-magnitude coefficients (signed values, ordered by |coef|).
coef = pd.Series(lr_clf.coef_[0], index=fullColumns)
maxcoef = pd.Series(pd.DataFrame(abs(coef).sort_values(ascending=False).head(20)).index)
weightsplot = pd.Series(coef, index=maxcoef)
weightsplot.plot(title = "Logistic Regression Coefficients", kind='bar', color = 'Tomato')
%%time
# Build the Admin-occupation binary analysis frame: load from the pickle
# cache if present; otherwise copy the sampled Admin data and drop the
# descriptive / already-encoded columns listed below.
if os.path.isfile(PickleJarPath+"/OPMAnalysisDataNoFamAdminBinary.pkl"):
print("Found the File! Loading Pickle Now!")
OPMAnalysisDataNoFamAdminBinary = unpickleObject("OPMAnalysisDataNoFamAdminBinary")
else:
OPMAnalysisDataNoFamAdminBinary = SampledOPMDataAdmin.copy()
# Columns to exclude from modeling — largely text descriptions (the *T
# variants), identifiers, and raw fields whose encoded counterparts are kept.
cols = ["GENDER",
"DATECODE",
"QTR",
"COUNT",
"AGYTYPT",
"AGYT",
"AGYSUB",
"AGYSUBT",
"QTR",
"AGELVLT",
"LOSLVL",
"LOSLVLT",
"LOCTYPT",
"LOCT",
"OCCTYP",
"OCCTYPT",
"OCCFAM",
"OCCFAMT",
"OCC",
"OCCT",
"PATCO",
"PPGRD",
"PATCOT",
"PPTYPT",
"PPGROUPT",
"PAYPLAN",
"PAYPLANT",
"SALLVLT",
"TOATYPT",
"TOAT",
"WSTYP",
"WSTYPT",
"WORKSCH",
"WORKSCHT",
"SALARY",
"LOS",
"SEPCount_EFDATE_OCC",
"SEPCount_EFDATE_LOC"
]
#delete cols from analysis data
# Guarded delete: skip any column already absent (e.g. when loaded from cache).
for col in cols:
if col in list(OPMAnalysisDataNoFamAdminBinary.columns):
del OPMAnalysisDataNoFamAdminBinary[col]
OPMAnalysisDataNoFamAdminBinary.info()
cols = ["AGELVL",
"LOC",
"SALLVL",
"TOA",
"AGYTYP",
"AGY",
"LOCTYP",
"PPTYP",
"PPGROUP",
"TOATYP"
]
#Split Values for cols
for col in cols:
if col in list(OPMAnalysisDataNoFamAdminBinary.columns):
AttSplit = pd.get_dummies(OPMAnalysisDataNoFamAdminBinary[col],prefix=col)
display(AttSplit.head())
OPMAnalysisDataNoFamAdminBinary = pd.concat((OPMAnalysisDataNoFamAdminBinary,AttSplit),axis=1) # add back into the dataframe
del OPMAnalysisDataNoFamAdminBinary[col]
pickleObject(OPMAnalysisDataNoFamAdminBinary, "OPMAnalysisDataNoFamAdminBinary")
display(OPMAnalysisDataNoFamAdminBinary.head())
print("Number of Columns: ",len(OPMAnalysisDataNoFamAdminBinary.columns))
OPMAnalysisDataNoFamAdminBinary.info()
# Load the scaler fitted on the Professional training data so the Admin data
# is transformed on the same scale.
# NOTE(review): there is no else branch — if the pickle is absent,
# OPMAnalysisScalerFit is undefined and the transform in the next cell fails.
if os.path.isfile(PickleJarPath+"/OPMAnalysisScalerFit.pkl"):
print("Found the File! Loading Pickle Now!")
OPMAnalysisScalerFit = unpickleObject("OPMAnalysisScalerFit")
#OPMAnalysisDataNoFamAdminBinary = OPMAnalysisDataNoFamAdmin[(OPMAnalysisDataNoFamAdmin["SEP"] == 'NS') | (OPMAnalysisDataNoFamAdmin["SEP"] == 'SC')].reset_index()
# Restrict to the exact column set the scaler was fitted on.
OPMAnalysisDataNoFamAdminBinaryScaled = OPMAnalysisDataNoFamAdminBinary[OPMScaledAnalysisData.columns]
print(OPMAnalysisDataNoFamAdminBinaryScaled.info())
%%time
OPMAnalysisDataNoFamAdminBinaryScaled = pd.DataFrame(OPMAnalysisScalerFit.transform(OPMAnalysisDataNoFamAdminBinaryScaled), columns = OPMAnalysisDataNoFamAdminBinaryScaled.columns)
print("Overall Accuracy, predicting Admin Binary Separation from Professional Model: ", knn_clf.score(OPMAnalysisDataNoFamAdminBinaryScaled[PCList], np.where(OPMAnalysisDataNoFamAdminBinary["SEP"]=="NS",0,1)))
results = pd.concat([OPMAnalysisDataNoFamAdminBinary, pd.DataFrame({"Prediction": knn_clf.predict(OPMAnalysisDataNoFamAdminBinaryScaled[PCList])})], axis = 1)
results["SEPNum"] = np.where(results["SEP"]=="NS",0,1)
results["PredictTxt"] = np.where(results["Prediction"]==0,"NS","SC")
display(pd.DataFrame({'Cnt' : results.groupby(["SEP"]).size()}).reset_index())
display(pd.DataFrame({'Cnt' : results.groupby(["SEP", "PredictTxt"]).size()}).reset_index())
print("confusion matrix\n{0}\n".format(pd.crosstab(results.PredictTxt, results.SEP, rownames = ['True'], colnames = ['Predicted'], margins = True)))
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrixBinary(confusion_matrix(results.Prediction, results.SEPNum),
classes =["NS", "SC"],
normalize =True,
title ='Confusion matrix, with normalization')
%%time
print("Overall Accuracy, predicting Admin Binary Separation from Professional Model: ", lr_clf.score(OPMAnalysisDataNoFamAdminBinaryScaled[fullColumns], np.where(OPMAnalysisDataNoFamAdminBinary["SEP"]=="NS",0,1)))
results = pd.concat([OPMAnalysisDataNoFamAdminBinary, pd.DataFrame({"Prediction": lr_clf.predict(OPMAnalysisDataNoFamAdminBinaryScaled[fullColumns])})], axis = 1)
results["SEPNum"] = np.where(results["SEP"]=="NS",0,1)
results["PredictTxt"] = np.where(results["Prediction"]==0,"NS","SC")
display(pd.DataFrame({'Cnt' : results.groupby(["SEP"]).size()}).reset_index())
display(pd.DataFrame({'Cnt' : results.groupby(["SEP", "PredictTxt"]).size()}).reset_index())
print("confusion matrix\n{0}\n".format(pd.crosstab(results.PredictTxt, results.SEP, rownames = ['True'], colnames = ['Predicted'], margins = True)))
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrixBinary(confusion_matrix(results.Prediction, results.SEPNum),
classes =["NS", "SC"],
normalize =True,
title ='Confusion matrix, with normalization')